Code Example #1
def create_tfidf(oversample=False, description=False):
    print("Reading the data...")

    if oversample:
        df_train = get_oversampled_train()
    else:
        df_train = pd.read_csv("data/train_raw.csv")

    df_test = pd.read_csv("data/test_raw.csv")

    print("Creating the corpus...")
    corpus_train = textacy.Corpus(lang='en',
                                  texts=df_train['description'].tolist())
    corpus_test = textacy.Corpus(lang='en',
                                 texts=df_test['description'].tolist())

    tokenized_docs_train = (doc.to_terms_list(ngrams=1,
                                              named_entities=True,
                                              as_strings=True)
                            for doc in corpus_train)
    tokenized_docs_test = (doc.to_terms_list(ngrams=1,
                                             named_entities=True,
                                             as_strings=True)
                           for doc in corpus_test)

    print("Generating TF-IDF...")
    vectorizer = textacy.Vectorizer(apply_idf=True,
                                    norm="l2",
                                    min_df=4,
                                    max_df=.95)
    tfidf_train = vectorizer.fit_transform(tokenized_docs_train)
    tfidf_test = vectorizer.transform(tokenized_docs_test)

    tfidf_train = pd.DataFrame(tfidf_train.toarray())
    tfidf_test = pd.DataFrame(tfidf_test.toarray())

    if description:
        pd.concat([tfidf_train, df_train['label']],
                  axis=1).to_csv("data/tfidf_train_description.csv",
                                 index=False)
        pd.concat([tfidf_test, df_test['label']],
                  axis=1).to_csv("data/tfidf_test_description.csv",
                                 index=False)
    else:
        pd.concat([tfidf_train, df_train['label']],
                  axis=1).to_csv("data/tfidf_train.csv", index=False)
        pd.concat([tfidf_test, df_test['label']],
                  axis=1).to_csv("data/tfidf_test.csv", index=False)
Code Example #2
def create_textacy_corpus(corpus_reader,
                          nlp,
                          tick=utility.noop,
                          strip_tensor=True):
    logger.info('creating corpus (this might take some time)...')
    batch_size = 100
    corpus = textacy.Corpus(nlp)
    document_id = 0
    n_chunk_threshold = 50000
    for filename, text, metadata in corpus_reader:

        metadata = utility.extend(
            metadata, dict(filename=filename, document_id=document_id))

        if len(text) > n_chunk_threshold:
            spacy_doc = textacy.spacier.utils.make_doc_from_text_chunks(
                text, lang=nlp, chunk_size=n_chunk_threshold)
            corpus.add_doc(spacy_doc, metadata)
        else:
            corpus.add_text(text, metadata)

        if strip_tensor:
            # NOTE: this revisits every doc already in the corpus on each
            # iteration; stripping only the newly added doc would avoid the
            # quadratic rescan (earlier tensors are already None).
            for doc in corpus:
                doc.spacy_doc.tensor = None

        document_id += 1
        if document_id % batch_size == 0:
            logger.info('%s documents added...', document_id)
            tick(document_id)

    return corpus
Code Example #3
def create_corpus(lang="en_core_web_lg"):
    # nlp = en
    # component = entities.FinancialEntityRecognizer(nlp, entitites._financial_institutions)  # initialise component
    # en.add_pipe(component, before="ner")
    bpd = BlockchainPapersDataset()
    corpus = textacy.Corpus(lang, data=bpd.records())
    return corpus
Code Example #4
def create_textacy_corpus(corpus_reader,
                          nlp,
                          tick=utility.noop,
                          n_chunk_threshold=100000):

    corpus = textacy.Corpus(nlp)
    counter = 0

    for filename, document_id, text, metadata in corpus_reader:

        metadata = utility.extend(
            metadata, dict(filename=filename, document_id=document_id))

        if len(text) > n_chunk_threshold:
            doc = textacy.spacier.utils.make_doc_from_text_chunks(
                text, lang=nlp, chunk_size=n_chunk_threshold)
            corpus.add_doc(doc)
            doc._.meta = metadata
        else:
            corpus.add((text, metadata))

        counter += 1
        if counter % 100 == 0:
            logger.info('%s documents added...', counter)
        tick(counter)

    logger.info('Done! %s documents added!', counter)

    return corpus
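
A hypothetical usage sketch for create_textacy_corpus above. The reader only needs to yield (filename, document_id, text, metadata) tuples; utility.noop and utility.extend are assumed to be a do-nothing callback and a dict-merge helper, as their names suggest.

import spacy

nlp = spacy.load("en_core_web_sm")

def demo_reader():
    # two tiny documents in the expected tuple format (hypothetical data)
    yield ("doc_0001.txt", 1, "A first tiny document.", {"year": 1999})
    yield ("doc_0002.txt", 2, "A second tiny document.", {"year": 2001})

corpus = create_textacy_corpus(demo_reader(), nlp)
print(len(corpus), corpus[0]._.meta)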
Code Example #5
    def exec_pipeline(self, texts, pipeline_components, normalize_texts=True):
        """
        Starts the NLP pipeline (https://miro.medium.com/max/700/1*tRJU9bFckl0uG5_wTR8Tsw.png) of spaCy defined in the constructor for a corpus.

        Parameters
        ----------
        texts : List[Dict]
            Expects a list of document dicts with their meta data
        pipeline_components : List[str]
            List of analysis types to consider for this pipeline
        normalize_texts : bool, optional
            Whether to clean the texts before processing; True by default (should only be False for debugging purposes).
        """

        self.corpus = textacy.Corpus(self.nlp)
        with self.nlp.disable_pipes(*self._remove_unused_components(pipeline_components)):
            if (self.threads == -1):
                if (platform.startswith('win32')):
                    partitions = minibatch(texts, math.ceil(len(texts) / cpu_count()))
                else:
                    from os import sched_getaffinity
                    partitions = minibatch(texts, math.ceil(len(texts) / len(sched_getaffinity(0))))
            else:
                partitions = minibatch(texts, math.ceil(len(texts) / self.threads))
            executor = Parallel(n_jobs=self.threads, require="sharedmem", prefer="threads", verbose=10)
            do = delayed(partial(self._exec_pipeline_for_sub_corpus, normalize_texts))
            tasks = (do(i, batch) for i, batch in enumerate(partitions))
            sub_corpora = executor(tasks)
            self.corpus.add_docs([doc for corpus in sub_corpora for doc in corpus])
Code Example #6
def create_textacy_corpus(documents, nlp, tick=utility.noop):
    corpus = textacy.Corpus(nlp)
    for filename, text, metadata in documents:
        corpus.add_text(text, utility.extend(dict(filename=filename),
                                             metadata))
        tick()
    return corpus
Code Example #7
def txt_to_corpus(txt_dir, lang=en, txt_extention=".txt"):
    """
    Reads a text directory and puts in a textacy corpus with the filename as metadata
    and adds a docstats_df
    """
    # Get the name of the function - should be decorator for every function
    functionNameAsString = sys._getframe().f_code.co_name
    logging.debug(
        f"Function: {functionNameAsString} -- Loading Text from: {txt_dir}")

    # Get a list of files to get text for
    flpth_gen = textacy.io.utils.get_filepaths(txt_dir,
                                               match_regex=None,
                                               ignore_regex=None,
                                               extension=".txt",
                                               ignore_invisible=True,
                                               recursive=True)

    # Loop through the text directory (input), for all the files ending with .txt
    #docs_lst = [dask.delayed(txtfile_to_doc)(flpth, en) for flpth in flpth_gen]
    rec_lst = [txt_to_docrec(flpth) for flpth in flpth_gen]
    #docs = dask.compute(docs_lst)

    # Add docs to a spacy corpus
    crps = textacy.Corpus(en, rec_lst)

    # Calculate stats for each doc in a corpus and make a docstats_df
    crpsStats = ptg.corpus_stats.CorpusStats(crps)

    crps.docstats_df = crpsStats.docstats_df

    logging.debug(f"Function: {functionNameAsString} -- Loaded {crps.n_docs}")

    return crps
Code Example #8
def get_topics(speeches, path_to_save, n_topics=10, n_words=10):
    '''
    INPUT: List of cleaned speeches
    OUTPUT: Top n_words for n_topics
    '''
    corpus = textacy.Corpus('en', texts=speeches)  # plain 'en' works; the str.decode() call was a Python 2 leftover

    doc_term_matrix, id2term = textacy.vsm.doc_term_matrix((doc.to_terms_list(ngrams=1,
                                named_entities=False, as_strings=True) for doc in corpus),
                                weighting='tfidf', normalize=True, smooth_idf=True, min_df=2,
                                max_df=0.95)

    model = textacy.tm.TopicModel('nmf', n_topics=n_topics)
    model.fit(doc_term_matrix)
    doc_topic_matrix = model.transform(doc_term_matrix)
    doc_topic_matrix.shape
    topic_dic = {}
    for topic_idx, top_terms in model.top_topic_terms(id2term, top_n=n_words):
        topic_dic['Topic' + ' ' + str(topic_idx)] = top_terms


    model.termite_plot(doc_term_matrix, id2term, topics=-1,  n_terms=25, highlight_topics = [2, 3, 4, 5, 8, 9],
                        sort_terms_by='seriation', save=path_to_save)


    return topic_dic
Code Example #9
    def create_corpus(self, fpath):
        """
        Load csv file /code/data_all.csv from disk. Each row is one document.
        It expects first column to be the Text / Body we want to analyse with
        textacy. The rest of the columns are stored as metadata associated
        to each document.

        Returns a textacy.Corpus.
        """

        # read all eea documents from csv file
        eeadocs = textacy.fileio.read.read_csv(fpath)

        # '/code/data_all.csv'
        # use title as "text" to analyse.
        # therefore split title (first column 0) from metadata
        content_stream, metadata_stream = textacy.fileio.split_record_fields(
            eeadocs, 0)

        # create textacy english Corpus
        corpus = textacy.Corpus('en',
                                texts=content_stream,
                                metadatas=metadata_stream)

        return corpus
Code Example #10
def load_corpus(filename, lang, document_id='document_id', format='binary'):
    if format == 'binary':
        '''HACK: read docs saved in 'binary' format. NOTICE: textacy patch'''
        docs = textacy_patch.read_spacy_docs(filename,
                                             format=format,
                                             lang=lang)
        corpus = textacy.Corpus(docs=docs, lang=lang)

        #spacy_docs = textacy.io.read_spacy_docs(filename, format=format, lang=lang)
        #first_spacy_doc, spacy_docs = itertoolz.peek(spacy_docs)
        #spacy_lang_meta = first_spacy_doc.user_data['textacy'].pop('spacy_lang_meta')
        #spacy_lang = spacy.util.get_lang_class(spacy_lang_meta['lang'])(vocab=first_spacy_doc.vocab, meta=spacy_lang_meta)
        #for name in spacy_lang_meta['pipeline']:
        #    spacy_lang.add_pipe(spacy_lang.create_pipe(name))
        #return cls(spacy_lang, docs=spacy_docs)

    else:
        corpus = textacy.Corpus.load(filename)

    #for doc in corpus:
    #    user_data = doc.spacy_doc.user_data
    #    user_data['year'] = int(user_data['year']) if 'year' in user_data else 0
    #    doc.metadata.update(user_data)
    #    #metadata = doc.spacy_doc.user_data['textacy']['metadata']
    #    #for x in ['filename', document_id]:
    #    #    if x in metadata.keys():
    #    #        corpus[0].metadata[x] = metadata[x]
    return corpus
Code Example #11
def make_corpus(df: pd.DataFrame, col_name: str,
                min_token_count: int) -> textacy.Corpus:
    spacy_records = df[col_name].apply(
        lambda x: textacy.make_spacy_doc(x, lang="en"))
    long_records = [
        record for record in spacy_records if len(record) >= min_token_count
    ]
    corpus = textacy.Corpus("en", data=list(long_records))
    return corpus
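
A hypothetical usage of make_corpus above with a toy DataFrame; it assumes the spaCy English pipeline resolves under the "en" name used in the example.

import pandas as pd

df = pd.DataFrame({"abstract": [
    "Too short.",
    "A longer abstract that comfortably exceeds the minimum token count.",
]})
corpus = make_corpus(df, col_name="abstract", min_token_count=5)
print(len(corpus))  # only the longer record survives the length filter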
Code Example #12
 def setUp(self):
     self.spacy_lang = textacy.data.load_spacy('en')
     self.cw = textacy.datasets.CapitolWords()
     self.text = list(self.cw.texts(speaker_name={'Bernie Sanders'}, limit=1))[0]
     self.doc = textacy.Doc(self.text.strip(), lang=self.spacy_lang)
     records = self.cw.records(speaker_name={'Bernie Sanders'}, limit=10)
     text_stream, metadata_stream = textacy.fileio.split_record_fields(
         records, 'text')
     self.corpus = textacy.Corpus(
         self.spacy_lang, texts=text_stream, metadatas=metadata_stream)
Code Example #13
File: get_model_data.py  Project: gryBox/AttnGAN
def df_to_corpus(df):
    # Load into textacy to delimit sentences
    img_labels = df.to_dict(orient="records")
    text_stream, metadata_stream = textacy.io.split_records(img_labels, 'RESOURCE')

    # Load english model
    en = en_core_web_sm.load()
    corpus = textacy.Corpus(lang=en, texts=text_stream, metadatas=metadata_stream)
    
    return corpus
Code Example #14
def get_stats(speeches):
    '''
    INPUT: List of uncleaned speeches
    OUTPUT: Dataframe of readability statistics
    '''

    corpus = textacy.Corpus('en', texts=speeches)  # plain 'en' works; the str.decode() call was a Python 2 leftover
    speech_stats = []
    for text in corpus:
        speech_stats.append(textacy.text_stats.readability_stats(text))

    return speech_stats
Code Example #15
 def _exec_pipeline_for_sub_corpus(self, normalize_texts, batch_id, docs):
     # Internal function to enable multi-threaded pipeline execution
     sub_corpus = textacy.Corpus(self.nlp)
     for doc in docs:
         if doc['text']:
             if normalize_texts:
                 spacy_doc = textacy.make_spacy_doc(
                     (normalize(self.language, doc['text']), {'celex': doc['celex']}), self.nlp)
             else:
                 spacy_doc = textacy.make_spacy_doc(doc['text'], {'celex': doc['celex']}, self.nlp)
             sub_corpus.add_doc(spacy_doc)
     return sub_corpus
Code Example #16
def df_to_corpus(df, txt_column='RESOURCE'):
    # Load into textacy to delimit sentences
    img_labels = df.to_dict(orient="records")
    text_stream, metadata_stream = textacy.io.split_records(
        img_labels, txt_column)

    # Load English model (the load step was missing here; this mirrors example #13
    # and assumes en_core_web_sm is importable at module level)
    en = en_core_web_sm.load()

    corpus = textacy.Corpus(lang=en,
                            texts=text_stream,
                            metadatas=metadata_stream)

    return corpus
Code Example #17
def createqcorpus(q):
    data = [tuple(x) for x in q.values]
    qcorpus = textacy.Corpus("en") #initialise corpus for question

    for r,*items in data:
        response = " ".join(r.split())
        if len(items) == 1:
            qcorpus.add_text(response, metadata={"ID":str(items[0]),"categories":[], "notes":""})
        elif len(items) == 2:
            qcorpus.add_text(response, metadata={"ID":str(items[0]),"PostID":str(items[1]),"categories":[], "notes":""})
        elif len(items) == 3:
            qcorpus.add_text(response, metadata={"ID":str(items[0]),"PostID":str(items[1]),"ParentID":str(items[2]),"categories":[], "notes":""})

    return qcorpus
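
A hypothetical call to createqcorpus above (it relies on the older textacy API where Corpus.add_text accepts a metadata keyword); q mimics a small question/answer export with ID and PostID columns, in that column order.

import pandas as pd

q = pd.DataFrame({
    "response": ["Yes, that   worked for me.", "No, it  did not."],
    "ID": [101, 102],
    "PostID": [7, 7],
})
qcorpus = createqcorpus(q)
print(qcorpus)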
Code Example #18
 def __init__(self):
     self._min_occurrence_for_topic = 2
     self._common_verbs = 10
     # create an empty corpus
     self._en = textacy.load_spacy_lang('en_core_web_sm', disable=('parser',))
     self._corpus = textacy.Corpus(lang=self._en)
     self._content = None
     self._model = None
     self._numdocs = 0
     self._numtopics = 0
     self._terms = None
     self._doc_term_matrix = None
     self._doc_topic_matrix = None
     self._vectorizer = Vectorizer(tf_type='linear', apply_idf=True, idf_type='smooth',
                                   norm='l2', min_df=3, max_df=0.95, max_n_terms=100000)
Code Example #19
def createtcorpus(filename, refId):
    # just reading one text for now - use tcorpus.add_texts() when multifile upload implemented
    tcorpus = textacy.Corpus("en")
    text_to_add = textacy.fileio.read.read_file(CORPUS_ROOT + filename)
    tcorpus.add_text(text_to_add)

    origpath = os.path.join(app.config['CORPUS_FOLDER'],str(refId))
    if not os.path.exists(origpath):
        os.makedirs(origpath)

    fname = filename.rsplit('.', 1)[0]
    path = os.path.join(origpath,fname)
    if not os.path.exists(path):
        os.makedirs(path)
    tcorpus.save(path,name=fname,compression="gzip")
    return tcorpus
Code Example #20
def process_text(lst, filepath=None, filename=None, compression=None):
    '''
    DESC: Tokenizes and processes lst of strings using textacy. If filepath: saves corpus as pickle to filepath.
    --Input--
        lst: list of strings
        filepath: (str) path to directory where textacy corpus will be saved
        filename: (str) name of pickled textacy corpus
        compression: (str) compression of metadata json ('gzip', 'bz2', 'lzma' or None)
    ----------------------------------
    --Output--
        Returns textacy corpus object, if filepath: saves textacy corpus as pickle
    '''
    corpus = textacy.Corpus('en', texts=lst)
    if filepath:
        corpus.save(filepath, filename, compression)
    return corpus
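
A hypothetical call to process_text above (it targets the older textacy API where Corpus takes a texts= keyword and has a save method); omitting filepath skips the pickling step.

docs = ["First toy document.", "Second toy document about something else."]
corpus = process_text(docs)
print(corpus)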
Code Example #21
def truncate_docs_in_daily_corpora(corpora):
    trunc_corpora = []
    for corpus in corpora:
        trunc_corpus = textacy.Corpus('en')
        for doc in corpus:
            first_sents_spans = list(itertools.islice(doc.sents, 2))
            # print(first_sents_spans)
            first_sents = ""
            for span in first_sents_spans:
                first_sents += str(span) + " "
            # print(first_sents)
            trunc_doc = textacy.Doc(first_sents, doc.metadata, 'en')
            # print(trunc_doc)
            trunc_corpus.add_doc(trunc_doc, doc.metadata)
        trunc_corpora.append(trunc_corpus)

    return trunc_corpora
Code Example #22
 def process_text(self, filepath=None, filename=None, compression=None):
     '''
     DESC: Tokenizes and processes pandas DataFrame using textacy. If filepath: saves corpus as pickle to filepath.
     --Input--
         filepath: (str) path to directory where textacy corpus will be saved
         filename: (str) name of pickled textacy corpus
         compression: (str) compression of metadata json ('gzip', 'bz2', 'lzma' or None)
     ----------------------------------
     --Output--
         Returns textacy corpus object, if filepath: saves textacy corpus as pickle
     '''
     if len(self.text) == 0:
         self._get_reviews_and_label()
     self.corpus = textacy.Corpus('en')
     self.corpus.add_texts(texts=self.text, batch_size=1000, n_threads=-1)
     if filepath:
         self.corpus.save(filepath, filename, compression)
         print('Saved textacy corpus to filepath.')
     return
Code Example #23
def normalise_text(corpus,settings):
    ncontractions = settings['ncontractions']
    lcase = settings['lcase']
    punct = settings['punct']
    nums = settings['nums']

    textlist = [d.text for d in corpus.docs]

    if ncontractions:
        textlist = [textacy.preprocess.unpack_contractions(w) for w in textlist]
        textlist = [re.sub(r"(\b)(nt|n't)", r"not", w) for w in textlist] # hack to deal with standalone n't missing from textacy unpack_contractions after tokenisation

    if lcase:
        textlist = [w.lower() for w in textlist]
    if punct:
        textlist = [strip_punctuation(w,settings) for w in textlist]
    if nums:
        textlist = [textacy.preprocess.replace_numbers(w,"") for w in textlist]
    ncorpus = textacy.Corpus("en")
    for t in textlist:
        ncorpus.add_text(t)
    return ncorpus
Code Example #24
def pipe01(limit=0):

    # Define textacy doc preprocessing
    textacy_preprocessor = lambda text: textacy.preprocess.preprocess_text(
        text,
        no_contractions=True,
        no_numbers=True,
        no_emails=True,
        no_currency_symbols=True,
        lowercase=True)
    # Define nlp pipeline
    nlp = spacy.load("en", add_vectors=False)
    nlp.pipeline = [nlp.tagger, nlp.parser]

    #
    handle = article_handles.articleObjectPhysOrg

    # Do all the other things
    with utils.mongo_open('BlogData', 'PhysOrg') as conn:
        article_iter = conn.query(conditions={
            'html': {
                '$ne': 'None'
            },
            'url': {
                '$regex': '^http://phys.org/news/'
            }
        },
                                  limit=limit)
        content_stream, metadata_stream = utils.proc_art_iterator(
            handle=handle, mdb_iterator=article_iter, tpp=textacy_preprocessor)
        corpus = textacy.Corpus(lang=nlp,
                                texts=content_stream,
                                metadatas=metadata_stream)

    # Return the data
    return (corpus)
Code Example #25
import textacy
import spacy.en
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pickle

cw = textacy.corpora.CapitolWords()
docs = cw.records(date_range=('1996-01-01', '2016-12-31'))
content_stream, metadata_stream = textacy.fileio.split_record_fields(
    docs, 'text', 'speaker_name')
# plain 'en' works; the str.decode() call was a Python 2 leftover
corpus = textacy.Corpus('en', texts=content_stream, metadatas=metadata_stream)

#Dataframe of the speaker and speeches created from the corpus
df = pd.read_pickle('speeches')

drop_words = [
    'objection', 'american', 'people', 'gentlewoman', 'gentleman', 'minute',
    'distinguished', 'yea', 'desk', 'vote', 'internship', 'unanimous',
    'consent', 'quorum', 'previous', 'session', 'amendment', 'read',
    'immediate', 'consideration', 'senator', 'congress', 'house', 'rollcall',
    'floor', 'desire', 'nay', 'present', 'ask', 'rescind', 'order',
    'recognize', 'yield', 'question', 'authorize', 'meet', 'proceed', 'motion',
    'pending', 'set', 'table', 'lie', 'president', 'speaker', 'appeal',
    'ruling'
]
Code Example #26
def start_cluster_batch():
    topic_list_query = "SELECT * from sharelock.topic_list"
    topic_rows = session.execute(topic_list_query)
    topic_row_list = list(topic_rows)
    topic_frames = pd.DataFrame(topic_row_list)
    for idx, frame in topic_frames.iterrows():
        topic = frame['topic']
        category = frame['category']
        query = "SELECT * from sharelock.active_tweets where topic='" + topic + "'order by inserted_at desc limit 30"
        rows = session.execute(query)
        ent_dict = {}
        sorted_json = {}

        row_list = []
        for row in rows:
            xd = json.loads(row.tweet_batch)
            row_list = row_list + xd

        sorted_result = pd.DataFrame(data=row_list)
        sorted_result.set_index('tweet_id')  # no-op without reassignment; 'tweet_id' stays a regular column
        sorted_result = sorted_result.drop_duplicates(subset='tweet_id',
                                                      keep='first')

        # Clean results by dropping items with a similarity score of 0.98 or higher

        sorted_result['tweet_tokens'] = sorted_result['tweet_text'].apply(nlp)
        sorted_result['tweet_clean_text'] = sorted_result['tweet_text'].apply(
            get_cleaned_text)
        sorted_result['tweet_clean_tokens'] = sorted_result[
            'tweet_clean_text'].apply(nlp)
        sorted_result = remove_duplicate_posts(sorted_result)

        corpus = textacy.Corpus(lang="en_core_web_lg",
                                texts=list(sorted_result['tweet_text']),
                                metadatas=list(sorted_result['tweet_id']))

        terms_list = (doc.to_terms_list(ngrams=(1, 2, 3),
                                        named_entities=True,
                                        normalize=u'lemma',
                                        lemmatize=True,
                                        lowercase=True,
                                        as_strings=True,
                                        filter_stops=True,
                                        filter_punct=True,
                                        min_freq=1,
                                        exclude_pos=("PRON", "X", "PUNCT",
                                                     "SYM")) for doc in corpus)

        vectorizer = textacy.Vectorizer(tf_type='linear',
                                        apply_idf=True,
                                        idf_type='smooth')

        textacy.text_utils.clean_terms(terms_list)  # returns cleaned terms rather than modifying in place; result unused here

        doc_term_matrix = vectorizer.fit_transform(terms_list)

        num_topics = int(len(sorted_result) / 10)

        model = textacy.tm.TopicModel('nmf', n_topics=num_topics)
        model.fit(doc_term_matrix)

        doc_topic_matrix = model.transform(doc_term_matrix)

        topic_cluster = {}
        for topic_idx, top_terms in model.top_topic_terms(
                vectorizer.id_to_term, topics=-1, top_n=8, weights=True):
            dct = dict(top_terms)
            tt_list = []
            for j in dct.keys():
                tt_list.append({"term": j, "weight": dct[j]})
            topic_cluster["topic-" + str(topic_idx)] = {"terms": tt_list}

        for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix,
                                                        topics=-1,
                                                        top_n=6,
                                                        weights=True):
            dct = dict(top_docs)
            tweet_in_topic_list = []
            for j in dct.keys():
                query_str = "tweet_id=" + corpus[j].metadata
                curr = sorted_result[sorted_result['tweet_id'] ==
                                     corpus[j].metadata]
                curr_frame_row = curr.iloc[0]
                is_attached_to_topic = False
                for prev_topic in topic_cluster:
                    if 'tweets' in topic_cluster[prev_topic]:
                        tweet_list = topic_cluster[prev_topic]['tweets']
                        for tweet in tweet_list:
                            if tweet['tweet_id'] == curr.iloc[0]['tweet_id']:
                                is_attached_to_topic = True
                                break

                if not is_attached_to_topic:
                    tweet_in_topic_list.append({
                        "tweet_id":
                        curr.iloc[0]['tweet_id'],
                        "tweet_text":
                        curr.iloc[0]['tweet_text'],
                        "user_score":
                        str(curr.iloc[0]['user_score']),
                        "raw_score":
                        str(curr.iloc[0]['raw_score'])
                    })
            if tweet_in_topic_list:
                topic_cluster["topic-" +
                              str(topic_idx)]['tweets'] = tweet_in_topic_list

        for curr_topic in topic_cluster:
            if 'tweets' in topic_cluster[curr_topic]:
                sent_weights = []
                for tweet in topic_cluster[curr_topic]['tweets']:
                    sent_weights = sent_weights + get_sent_weights(
                        tweet, topic_cluster[curr_topic]['terms'])
                sent_weights = sorted(sent_weights,
                                      key=lambda x: x['final_score'],
                                      reverse=True)
                top_sents = sent_weights[0:2]
                sorted_top_sents = sorted(sent_weights,
                                          key=lambda x: x['ent_score'],
                                          reverse=True)
                topic_title = ""
                topic_title_list = []
                for sent in sorted_top_sents:
                    if sent['structure_penalty'] < 50 and sent[
                            'word_score'] > 0:
                        topic_title_list.append(sent['text'].strip('\n'))
                topic_cluster[curr_topic]['title'] = topic_title_list

        result_dict = {}
        for k in topic_cluster.keys():
            if 'tweets' in topic_cluster[k]:
                result_dict[k] = topic_cluster[k]

        insert_at = datetime.datetime.now().timestamp()

        insert_values = [topic, category, insert_at, json.dumps(result_dict)]

        sql_query = "INSERT into sharelock.topic_clusters (topic, category, inserted_at, tweet_cluster) values (?, ?, ?, ?)"
        try:
            prepared = session.prepare(sql_query)
            session.execute(prepared, (insert_values))
        except Exception as e:
            print(e)
Code Example #27
import textacy
import textacy.datasets
# import numpy
# import spacy
#
# text = ('Since the so-called "statistical revolution" in the late 1980s and mid 1990s, '
#         'much Natural Language Processing research has relied heavily on machine learning. '
#         'Formerly, many language-processing tasks typically involved the direct hand coding '
#         'of rules, which is not in general robust to natural language variation. '
#         'The machine-learning paradigm calls instead for using statistical inference '
#         'to automatically learn such rules through the analysis of large corpora '
#         'of typical real-world examples.')
#
# textacy.text_utils.KWIC(text, 'example', window_width=35)
#
# print(textacy.preprocess_text(text, lowercase=True, no_punct=True)+"\n")
# # spacy.load('en')
# doc = textacy.Doc(text)

cw = textacy.datasets.CapitolWords()
cw.download()
records = cw.records(speaker_name={'Hillary Clinton', 'Barack Obama'})
text_stream, metadata_stream = textacy.fileio.split_record_fields(
    records, 'text')
corpus = textacy.Corpus('en', texts=text_stream, metadatas=metadata_stream)
print(corpus)
Code Example #28
    with left_column:
        pos = st.multiselect('Keep', default_pos, default=default_pos)

    with right_column:
        freq_cutoff = st.number_input('Minimum Token Freq',
                                      min_value=0,
                                      value=2)

        #color_option = st.radio('Wordcloud colour', ["Default", "Choose one"])

        #if (color_option == "Choose one"):
        #    colors = st.color_picker('Colour')

    en = textacy.load_spacy_lang("en_core_web_sm", disable=())
    matcher = get_matcher(en, pos)
    corpus = textacy.Corpus(en, data=texts)
    terms = get_terms_for_wordcloud(matcher, corpus)

    if (len(terms) > 0):
        wordcloud_terms = [dict(text=k, value=v) for k, v in terms.items()]
        return_obj = wordcloud.visualize(wordcloud_terms,
                                         tooltip_data_fields={
                                             'text': 'Term',
                                             'value': 'Frequency'
                                         },
                                         per_word_coloring=False)
        #print(clicked['clicked'])
        if (return_obj != None and 'clicked' in return_obj):
            if (return_obj['clicked'] != None):
                focus_word = return_obj['clicked']['text']
Code Example #29
        text = row[text_column].replace("\n", " ")

        # use all columns as metadata, except the column with the actual text
        metadata = row.to_dict()
        del metadata[text_column]

        yield (text, metadata)


# set correct path relative to working directory (folder where you saved this script)
f_csv = "materials/dataset_speeches_federal_council_2019.csv"

# stream the csv-dataset by calling the function defined above
texts = get_texts_from_csv(f_csv, text_column="Text")
# create a corpus with all the texts
corpus_speeches = textacy.Corpus(de, data=texts)

# task 5: two subcorpora
# define two functions filtering by language and period
# similar as the lambda functions shown in slides, yet may be better understandable


def filter_func_pre(doc):
    return doc._.meta.get("Sprache") == "de" and doc._.meta.get("Jahr") < 2000


# greater-equal to include the year 2000
def filter_func_post(doc):
    return doc._.meta.get("Sprache") == "de" and doc._.meta.get("Jahr") >= 2000
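

A sketch (not part of the original script) of one way the two filter functions might be applied, reusing the Corpus(lang, data=...) pattern shown above:

pre_docs = [doc for doc in corpus_speeches if filter_func_pre(doc)]
post_docs = [doc for doc in corpus_speeches if filter_func_post(doc)]
corpus_pre = textacy.Corpus(de, data=pre_docs)
corpus_post = textacy.Corpus(de, data=post_docs)
print(len(corpus_pre), len(corpus_post))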

Code Example #30
File: make_corpus.py  Project: derpyninja/nlp4cciwr
def create_corpus(
    input_filepath,
    output_filepath,
    nlp=None,
    specific_stopwords=None,
    return_data=False,
):
    """
    Runs data processing scripts to turn raw data from (../raw) into
    cleaned data ready to be analyzed (saved in ../processed).

    Parameters
    ----------
    input_filepath : str
        Folder path storing un-mutable raw data. Use a wildcard within the
         file name to filter files via glob.glob.
    output_filepath : str
        File path where corpus should be saved.
    nlp : spaCy
        NLP pipeline
    specific_stopwords : iterable, None
        Case specific stopwords that are worth deleting before going into the
        spaCy pipeline to prevent memory allocation problems.
    return_data : bool
        Whether to return the corpus (keep it in memory) after it has been
        saved to output_filepath.

    Returns
    -------
    corpus: textacy.Corpus
        Corpus created from BBC Monitoring data stored in binary format
        and zipped for optimal compression
    """
    logger = logging.getLogger(__name__)
    logger.info("Creating corpus from raw BBC Monitoring data")

    # load and configure spacy nlp model
    # https://stackoverflow.com/questions/52557058/spacy-nlp-pipeline-order-of-operations
    # -------------------------------------------------------------------------
    if nlp is None:
        nlp = en_core_web_lg.load()
        nlp.max_length = int(30 * 1e6)
        nlp.remove_pipe("parser")
        nlp.remove_pipe("ner")

    # compile list of documents (slower, but more robust than os.listdir)
    # -------------------------------------------------------------------------
    file_list = glob.glob(input_filepath)
    file_list = sorted(file_list)

    # iteratively read in text stream
    # -------------------------------------------------------------------------
    records = []
    for file_path in tqdm(file_list):

        # extract metadata of file name
        # ---------------------------------------------------------------------
        fname = file_path.split("/")[-1].split(".")[0].split("_")

        if len(fname) == 2:
            river_basin, year = fname
            month = np.nan
        elif len(fname) == 3:
            river_basin, year, month = fname
        else:
            raise NotImplementedError("Check needed!")

        metadata = {"basin": river_basin, "year": year, "month": month}

        # read and pre-process with nlp pipeline
        # ---------------------------------------------------------------------
        with open(file_path) as f_input:
            # 1) read raw text file
            text_raw = f_input.read()

            # 2) pre-process with utils (textacy only, or textacy & gensim)
            text = preprocess_text(
                text_raw,
                char_count_filter=True,
                stopwords=specific_stopwords,
                min_len=3,
                max_len=15,
            )

            # 3) create doc with metadata
            doc = textacy.make_spacy_doc(data=(text, metadata), lang=nlp)

            # 4) append record
            records.append(doc)

    # build corpus
    # ---------------------------------------------------------------------
    corpus = textacy.Corpus(nlp, data=records)
    corpus.save(output_filepath)

    # optionally keep corpus in memory
    if return_data:
        return corpus
    else:
        return None
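
A hypothetical invocation of create_corpus above; the glob pattern and output path are placeholders, and return_data=True keeps the corpus in memory after it is saved.

corpus = create_corpus(
    input_filepath="data/raw/*.txt",                  # placeholder glob pattern
    output_filepath="data/processed/corpus.bin.gz",   # placeholder output path
    return_data=True,
)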