Esempio n. 1
0
 def load_crime_data(folder, count=500):
     """Load and pre-process the numbered ``.data`` files of one dataset.

     Files are read from ``data/<folder>_data/<i>.data`` for ``i`` in
     ``0 .. count - 1`` and each file's full text is passed through
     ``process_text``.

     Args:
         folder: Dataset name used to build the directory ``data/<folder>_data/``.
         count: Number of consecutively numbered files to read, starting
             at ``0.data``. Defaults to 500, the value that used to be
             hard-coded.

     Returns:
         A list holding one ``process_text`` result per file, in file-number
         order. Empty when ``count`` is 0.

     Raises:
         OSError: If one of the expected files cannot be opened.
     """
     # The directory prefix does not change between iterations.
     base = 'data/' + folder + '_data/'
     tests = []
     for i in range(count):
         with open(base + str(i) + '.data', 'r') as f:
             tests.append(process_text(f.read()))
     return tests
Esempio n. 2
0
def preprocess_records():
    """Wrap every database record in a ``TaggedDocument`` tagged by position.

    Each record returned by ``get_db_records()`` is run through
    ``process_text``; the result is paired with a one-element tag list
    containing the record's index.

    Returns:
        A list of ``TaggedDocument`` objects in the same order as the records.
    """
    return [
        TaggedDocument(process_text(record), [index])
        for index, record in enumerate(get_db_records())
    ]
Esempio n. 3
0
def preprocess_records():
    """Run ``process_text`` over every record from ``get_db_records()``.

    Returns:
        A list of the processed records, in the same order as the input.
    """
    return [process_text(record) for record in get_db_records()]
Esempio n. 4
0
def noncrime_dataset(n, save, load=False):
    """Build, or reload from disk, the shuffled array of non-crime posts.

    Args:
        n: Forwarded to ``conn.get_noncrime_posts``
            (presumably the number of posts to fetch -- confirm).
        save: When truthy, pickle the freshly built array to ``nc.data``
            (written before shuffling).
        load: When truthy, skip the database entirely and return the array
            previously pickled in ``nc.data``.

    Returns:
        The dataset, shuffled in place with ``np.random.shuffle``.

    Raises:
        OSError: If ``nc.data`` cannot be opened.
        ValueError: From ``np.stack`` when no fetched post passes the length
            filter, or when the processed posts cannot be stacked together.
    """
    if load:
        # NOTE(review): pickle.load can execute arbitrary code embedded in
        # the file; only load an nc.data that this program wrote itself.
        with open('nc.data', 'rb') as f:
            X = pickle.load(f)
        np.random.shuffle(X)
        return X

    posts = []
    for post in conn.get_noncrime_posts(n):
        # Process each post once and reuse the result for both the length
        # check and the dataset (the text used to be processed twice).
        processed = process_text(post[0])
        if len(processed) > 10:
            posts.append(processed)
    print(posts)
    X = np.stack(posts)
    if save:
        with open('nc.data', 'wb') as f:
            pickle.dump(X, f)
    np.random.shuffle(X)
    return X
Esempio n. 5
0
    def build_dataset(keywords):
        """
        Interactively label posts from threads whose heading matches a keyword.

        Queries the ``Thread`` table for threads whose lower-cased heading
        matches any of ``keywords``, then prints each sufficiently long post
        and waits for console input. Answering ``y`` appends ``post[0]`` as a
        line to ``<keywords[0]>_training.data``. Labelling stops once that
        file holds more than 500 lines.

        Args:
            keywords: Sequence of strings, joined with ``|`` into a regex
                alternation. The first one also names the output file.

        Returns:
            None.
        """
        conn = db()

        # NOTE(review): the keywords are spliced straight into the SQL text,
        # so they must come from a trusted source; switch to a parameterized
        # query if conn.run_query supports one.
        keyword = "'.*(" + '|'.join(keywords) + ").*'"

        query = '''SELECT "IdThread" FROM "Thread" WHERE LOWER("Heading") ~ ''' + \
            keyword + ''' AND "Site" = 0 AND "NumPosts" < 200'''
        threads = conn.run_query(query)

        print(len(threads))
        for thread in threads:
            thread_id = thread[0]

            for post in conn.get_posts_from_thread(thread_id):
                # Skip posts whose processed form is too short to be useful.
                if len(process_text(post[1])) <= 5:
                    continue

                print(remove_tags(post[1]))
                add = input()

                # 'a+' creates the file when missing, so the line count below
                # works even before the first post has been accepted.
                with open(keywords[0] + "_training.data", 'a+') as f:
                    if add == 'y':
                        f.write(str(post[0]) + '\n')
                    # Rewind to count every line, including the one just
                    # written, without reopening the file.
                    f.seek(0)
                    length = len(f.readlines())

                finished = length > 500
                if finished:
                    print("All done")
                print(
                    "=================================================================================="
                    + str(length) + "\n")
                if finished:
                    # Previously "All done" was printed but the prompts kept
                    # coming; stop once the target size is exceeded.
                    return
Esempio n. 6
0
# Configure the model for binary classification and train it for two epochs,
# validating against the held-out split after each one.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Training progress:')
history = model.fit(x_train,
                    y_train,
                    epochs=2,
                    batch_size=64,
                    validation_data=(x_val, y_val))

# Build an extra evaluation set from the posts of a few hand-picked threads.
super_test = []
for thread in [5881918, 1262128, 2804572, 1065115]:
    posts = conn.get_posts_from_thread(thread)
    for p in posts:
        # Process each post once and reuse the result for both the debug
        # print and the dataset (the text used to be processed twice).
        processed_post = process_text(p[1])
        print(processed_post)
        super_test.append(processed_post)
# One more hand-written sample appended after the thread posts.
super_test.append(
    process_text(
        "The most I've gotten out of one guy is around $450. I kept milking him (started as $30) but then he started asking to vid call me and wouldn't stop. Few days later I acted like my parents caught me and took my phone, I even acted like I'm the e-w***e's father and texted the guy LMAO. He said he was just a friend from high school ***IMG***[https://hackforums.net/images/smilies/hehe.gif]***IMG*** I've already got him to buy the flight tickets in my w****s name. next he's buying our accommodation in Fiji. I'm surprised he's not even indian, legit just a white American Male."
    ))
super_test = np.stack(super_test)
# Two hand-written sanity-check samples, processed the same way as the data.
a = np.array([
    process_text(
        "This isnt about crime, in fact I am just writing about rainbows and ponies, I love ponies so much and rainbows are so pretty I just want to see them everyday"
    ),
    process_text(
        "I'm the best in the world, I make so much money ripping people off, buy my, they will make you a lot of money, very very quickly"
    )
])
print(a)
Esempio n. 7
0
 def preprocess_tfidf(self):
     """Return the ``process_text`` result for every database record.

     Returns:
         A list with one processed entry per record from
         ``get_db_records()``, in the same order.
     """
     processed = []
     for record in get_db_records():
         processed.append(process_text(record))
     return processed