Example #1
import json
import time

# `text_processor` is the project's NLP wrapper object, defined elsewhere
# in the source module.
def process_records(records):
    """
    Add NLP results (lemmas and entities) to each record in place.
    """
    start = time.time()
    # print(f'Executing process_records(records), Record number = {len(records)}')
    for i, rec in enumerate(records):
        text_values = [
            rec['title'], rec['link_title'], rec['announce'], rec['uannounce'],
            rec['full-text']
        ]

        # A leading '. ' is prepended to work around a glitch in the library
        combined_text = '. ' + '. '.join(
            v for v in text_values if v is not None)
        o = text_processor.process_text(combined_text, clear=True)
        entities_text = ', '.join(r['name'] for r in o['entities_list'])

        rec['lemmatized_text'] = o['lemmatized_text']
        rec['entities_text'] = entities_text
        rec['entities_grouped'] = json.dumps(o['entities_grouped'],
                                             ensure_ascii=False)
        rec['process_status'] = o['process_status']
        if i % 10 == 0:
            print(f'{i:>15}')
    duration = time.time() - start
    return records, None, duration
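For orientation, the examples in this listing only read a handful of keys from the dictionary that text_processor.process_text returns. A minimal stub with that shape (the key names are taken from the usage above; the values and the stub itself are assumptions, not the library's real output) could look like this:

# Hypothetical stub of the process_text() result shape; only the keys that
# the surrounding examples actually read are included.
def process_text_stub(text, clear=False):
    return {
        'lemmatized_text': text.lower(),         # placeholder "lemmatization"
        'entities_list': [{'name': 'Example'}],  # each entity exposes 'name'
        'entities_grouped': {},                  # later serialized via json.dumps
        'process_status': 'ok',
    }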
Example #2
def search():
    text = get_text()
    field = request.args.get('field', 'lemmatized_text')
    skip = request.args.get('skip', 0, type=int)
    limit = request.args.get('limit', 20, type=int)
    timeout = request.args.get('timeout', '5s')
    lemmatize = request.args.get('lemmatize', 'true')
    from_date = request.args.get('from_date', '2000-01-01')
    to_date = request.args.get('to_date', '2030-01-01')
    index = request.args.get('index', 'articles')

    if lemmatize != "false":
        o = text_processor.process_text(text, clear=True)
        text = o.get('lemmatized_text','')
    else:
        logging.warning('NOT lemmatized !!!!')

    search_result = elastic.search(text, skip=skip, limit=limit, field=field,
                                   timeout=timeout, from_date=from_date,
                                   to_date=to_date, index=index)
    return make_response(search_result)
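The view above relies on Flask's request and make_response, so it is presumably registered on a Flask application. A minimal, hypothetical wiring (the URL rule and the app object are assumptions, not part of the original project) might be:

# Hypothetical registration; '/search' and `app` are assumptions.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/search', view_func=search)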
Example #3
import operator

# Load the pre-split training / test / validation rows from SQLite
c = conn.cursor()
c.execute("SELECT * FROM training")
tr_d = c.fetchall()
c.execute("SELECT * FROM test")
te_d = c.fetchall()
c.execute("SELECT * FROM validation")
va_d = c.fetchall()

# Column 0 holds the raw text; column 2 holds the label,
# converted to one-hot via sent2onehot
tr_x = [d[0] for d in tr_d]
tr_y_ = [sent2onehot(d[2]) for d in tr_d]
te_x = [d[0] for d in te_d]
te_y_ = [sent2onehot(d[2]) for d in te_d]
va_x = [d[0] for d in va_d]
va_y_ = [sent2onehot(d[2]) for d in va_d]

ptr_x = [process_text(x, stop_words) for x in tr_x]
pte_x = [process_text(x, stop_words) for x in te_x]
pva_x = [process_text(x, stop_words) for x in va_x]

# Count token frequencies over the processed training texts and keep the
# vocab_size most frequent tokens as features
vocab_size = 1000
vocab = {}

for text in ptr_x:
    tokens = text.split(' ')
    for t in tokens:
        vocab[t] = vocab.get(t, 0) + 1
sorted_vocab = sorted(vocab.items(), key=operator.itemgetter(1),
                      reverse=True)[:vocab_size]
features = [v[0] for v in sorted_vocab]

tr_dtmat = word2vec(ptr_x, features)
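The word2vec helper used on the last line is not shown in this listing. Judging by how it is called, it appears to turn each processed text into a count vector over the selected features; a hypothetical sketch under that assumption (not the project's actual implementation) is:

# Hypothetical sketch of word2vec(texts, features): a plain bag-of-words
# count matrix over the chosen vocabulary, returned as a list of rows.
def word2vec_sketch(texts, features):
    index = {f: i for i, f in enumerate(features)}
    matrix = []
    for text in texts:
        row = [0] * len(features)
        for token in text.split(' '):
            if token in index:
                row[index[token]] += 1
        matrix.append(row)
    return matrix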
Example #4
    def data_from_db(self):
        """
        Get the subreddit corpus from the reddit.db database.

        :return:
        text_matrix: matrix of subreddit texts; each row is one subreddit
        sub_list: list of subreddits included in the matrix
        sub_to_index: dict mapping a subreddit name to its row index in the matrix
        """

        sub_list = []
        text_matrix = []
        unstemmed_text_matrix = []  # used for word cloud later

        connecting_to_db = True

        sql_command = "SELECT subreddit, GROUP_CONCAT(body, ' ') as all_comments FROM comments GROUP BY subreddit"

        while connecting_to_db:
            try:
                print("Connecting to DB.\n")
                pwd = os.getcwd()
                db_conn = sqlite3.connect(pwd + '/../db/reddit.db')
                c = db_conn.cursor()
                results = c.execute(sql_command)

            except sqlite3.OperationalError:
                print("Table does not exist yet. Creating from CSV.\n")
                create_db(db_conn)
                continue

            print("Done.")

            break

        english_stop_words = stopwords.words('english')

        # Only needed by the subscriber-count filter that is commented out below
        r = praw.Reddit(user_agent='daniel_scraper')

        for i, row in enumerate(list(results)):
            print("Loading subreddit {}: {}....".format(i, row[0]), end="")

            '''
            try:
                if r.get_subreddit(row[0]).subscribers < 50000:
                    print("Done")
                    continue

            except:
                print("Something went wrong. Continuing.")
                continue
            '''

            sub_list.append(row[0].lower())
            text_matrix.append(process_text(row[1], punctuation, english_stop_words))

            unstemmed_text_matrix.append(process_text(row[1], punctuation, english_stop_words, stem=False))

            print("Done")

        sub_to_index = {sub_name: index for index, sub_name in enumerate(sub_list)}

        print("Done.\n")

        text_matrix = np.array(text_matrix)
        unstemmed_text_matrix = np.array(unstemmed_text_matrix)

        np.save('unstemmed_text_matrix.npy', unstemmed_text_matrix)
        np.save('text_matrix.npy', text_matrix)
        pickle.dump(sub_list, open("sub_list.p", "wb"))
        pickle.dump(sub_to_index, open("sub_to_index.p", "wb"))

        return text_matrix, sub_list, sub_to_index
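The method persists its results to disk; reloading them in a later session is straightforward (the file names come from the code above, but this loading snippet itself is not part of the original project):

import pickle

import numpy as np

text_matrix = np.load('text_matrix.npy')
unstemmed_text_matrix = np.load('unstemmed_text_matrix.npy')
with open('sub_list.p', 'rb') as f:
    sub_list = pickle.load(f)
with open('sub_to_index.p', 'rb') as f:
    sub_to_index = pickle.load(f)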
Example #5
def clear_lemmas_entities():
    text = get_text()
    o = text_processor.process_text(text, clear=True)
    return make_response(o)
Example #6
def lemmas_entities():
    text = get_text()
    o = text_processor.process_text(text)
    return make_response(o)
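All three Flask views above call a shared get_text() helper that is not part of this listing. A hypothetical stand-in, assuming the text arrives as a query parameter or in the request body, could be:

# Hypothetical stand-in for get_text(); the real helper may read the text
# differently (e.g. from JSON or form data).
from flask import request

def get_text():
    return request.args.get('text') or request.get_data(as_text=True)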