Example #1
 def preprocess(self, docs):
     bigram = Phrases(docs, min_count=self._min_count)
     for idx in range(len(docs)):
         for token in bigram[docs[idx]]:
             if '_' in token:
                 # Token is a bigram, add to document.
                 docs[idx].append(token)
     # Create a dictionary representation of the documents.
     dictionary = Dictionary(docs)
     # Filter out words
     dictionary.filter_extremes(no_below=self._no_below,
                                no_above=self._no_above)
     # Bag-of-words representation of the documents.
     corpus = [dictionary.doc2bow(doc) for doc in docs]
     self._corpus = corpus
     self._dictionary = dictionary
     if self._verbose:
         print('Number of unique tokens: %d' % len(dictionary))
         print('Number of documents: %d' % len(corpus))
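As a point of reference, the corpus and dictionary built by a preprocess step like the one above are usually fed straight into a topic model. A minimal, self-contained sketch (assuming gensim's LdaModel; the token lists and parameter values are purely illustrative, not from the original):

from gensim.corpora import Dictionary
from gensim.models import LdaModel

# Illustrative token lists; in the example above these come from preprocess().
docs = [["machine", "learning", "machine_learning", "models"],
        ["deep", "learning", "neural", "network", "models"]]

dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]

# Train a tiny LDA model on the bag-of-words corpus; num_topics is arbitrary here.
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=5)
print(lda.print_topics())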
Example #2
def main(args):

    logger = logging.getLogger(__name__)
    logger.info('Preprocessing ' + args.input)

    results = []
    stopword_list = build_stop_words()
    df = pd.DataFrame([])
    docs = np.array([])

    with timer("Load & Clean"):
        with open(f"{args.dir}/{args.input}", 'r', newline='', encoding='utf-8') as f:
            reader = csv.reader(f, delimiter=',')
            for row in reader:
                abstract_col = clean_str(row[8], stopword_list)
                # title, abstract, publish_time, authors, url
                results.append(
                    [row[3], abstract_col, row[9], row[10], row[17]])

        df = pd.DataFrame(results[1:], columns=results[0])

    with timer("Drop NA & Duplicates"):
        df = df.drop_duplicates(subset='abstract', keep='first')
        df = df.dropna(subset=["abstract"])

    with timer("Drop Non-English Papers"):
        df = check_language(df)

    with timer("Vectorize abstract column"):
        docs = np.array(list(sent_to_words(df.abstract)))

    with timer("Add bigrams to docs"):
        # Add bigrams to docs (only ones that appear 20 times or more).
        bigram = Phrases(docs, min_count=20)
        for idx in range(len(docs)):
            for token in bigram[docs[idx]]:
                if '_' in token:
                    # Token is a bigram, add to document.
                    docs[idx].append(token)

    with timer("Export results"):
        df.to_csv(f"{args.dir}/df_cleaned.csv", encoding='utf-8', index=False)
        np.save(f"{args.dir}/docs.npy", docs)
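The `timer` context manager used above is not shown in the snippet; a minimal stand-in (purely an assumption about what it does, not the original helper) could look like this:

import time
from contextlib import contextmanager

@contextmanager
def timer(name):
    # Print how long the wrapped block took; illustrative replacement only.
    start = time.time()
    yield
    print(f"[{name}] done in {time.time() - start:.1f} s")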
Example #3
def preprocess(sentences, spacy_model, stopwords, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    dataset = []

    for sentence in sentences:
        words = [token.lemma_ for token in spacy_model(sentence) if token.pos_ in allowed_postags]
        words = simple_preprocess(' '.join(words), deacc=True)
        words = [word for word in words if not word in stopwords]
        dataset.append(words)

    # Build the bigram and trigram models
    bigram = Phrases(dataset, min_count=5, threshold=100)  # higher threshold, fewer phrases
    # trigram = Phrases(bigram[dataset], threshold=100)

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = Phraser(bigram)
    # trigram_mod = Phraser(trigram)

    bigrams = [bigram_mod[sentence] for sentence in dataset]
    return bigrams
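The commented-out trigram lines above hint at a second Phrases pass; a minimal sketch of that extension (toy data, with low thresholds chosen only so the example actually produces phrases):

from gensim.models import Phrases
from gensim.models.phrases import Phraser

# Toy corpus; in the example above `dataset` comes from the spaCy preprocessing.
dataset = [["new", "york", "city", "is", "large"],
           ["new", "york", "city", "has", "parks"]] * 10

bigram = Phrases(dataset, min_count=1, threshold=0.1)
bigram_mod = Phraser(bigram)

# Trigrams are learned on the bigram-transformed corpus, not on a single sentence.
trigram = Phrases(bigram_mod[dataset], min_count=1, threshold=0.1)
trigram_mod = Phraser(trigram)

trigrams = [trigram_mod[bigram_mod[sentence]] for sentence in dataset]
print(trigrams[0])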
Example #4
def bigphrase_tfidf_feats(dataset):
    corpus = preprocessing_txt(dataset)
    lemmetized_sent = []
    for each_sent in nlp.pipe(corpus, batch_size=50, n_threads=-1):
        if each_sent.is_parsed:
            # Keep tokens that are not punctuation, whitespace, stop words or numbers.
            res = [
                tok.lemma_ for tok in each_sent
                if not (tok.is_punct or tok.is_space
                        or tok.is_stop or tok.like_num)
            ]
            lemmetized_sent.append(res)
        else:
            lemmetized_sent.append(None)
    bigram = Phraser(Phrases(lemmetized_sent))
    bigram_lem = list(bigram[lemmetized_sent])
    parsed = []
    for k in range(0, len(bigram_lem)):
        joined = ' '.join(bigram_lem[k])
        parsed.append(joined)
    return parsed, bigram_lem
Example #5
def build_gensim_model(features,
                       num_features=100,
                       min_word_count=100,
                       context=5,
                       downsampling=1e-3,
                       verbose=True):
    """

    """
    from gensim.models import Phrases
    from gensim.models import word2vec
    import time
    import logging
    import multiprocessing

    start = time.time()

    if (verbose):
        # Let's make sure we are logging; this will take a long time and it's good to get updates.
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
        level=logging.INFO)

    # Transforming to bigram representation
    bigram_transformer = Phrases(features)

    if (verbose):
        print("Training model...")

    # Initialize and train the model
    model = word2vec.Word2Vec(bigram_transformer[features],
                              workers=multiprocessing.cpu_count(), \
                              size=num_features, \
                              min_count = min_word_count, \
                              window = context, \
                              sample = downsampling)

    # We don't plan on training the model any further, so calling
    # init_sims will make the model more memory efficient by normalizing the
    # vectors in-place.
    model.init_sims(replace=True)

    return (model)
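Note that `size=` and `init_sims()` belong to the gensim 3.x API; under gensim 4.x (an assumption about the target environment) roughly the same call would look like the sketch below, with `size` renamed to `vector_size` and no `init_sims` step:

import multiprocessing
from gensim.models import Phrases, word2vec

# Toy corpus; in build_gensim_model() `features` is a list of token lists.
features = [["good", "movie"], ["bad", "movie"], ["good", "film"]] * 50

# Low threshold only so the toy corpus yields phrases.
bigram_transformer = Phrases(features, min_count=1, threshold=0.01)

# gensim 4.x: `size` became `vector_size`; vectors no longer need init_sims().
model = word2vec.Word2Vec(bigram_transformer[features],
                          workers=multiprocessing.cpu_count(),
                          vector_size=100,
                          min_count=1,
                          window=5,
                          sample=1e-3)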
Example #6
def bow_corpus(original_corpus):
    docs = list(original_corpus)
    # Tokenize the documents.
    
    # Split the documents into tokens.
    tokenizer = RegexpTokenizer(r'\w+')
    for idx in range(len(docs)):
        docs[idx] = docs[idx].lower()  # Convert to lowercase.
        docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

    # Remove numbers, but not words that contain numbers.
    docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

    # Remove words that are two characters or shorter.
    docs = [[token for token in doc if len(token) > 2] for doc in docs]

    # Lemmatize the documents.

    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]
    
    # Compute bigrams.
    from gensim.models import Phrases

    # Add bigrams and trigrams to docs (only ones that appear 20 times or more).
    bigram = Phrases(docs, min_count=20)
    for idx in range(len(docs)):
        for token in bigram[docs[idx]]:
            if '_' in token:
                # Token is a bigram, add to document.
                docs[idx].append(token)

    # Remove rare and common tokens.
    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)

    # Filter out words that occur in fewer than 2 documents, or in more than 50% of the documents.
    dictionary.filter_extremes(no_below=2, no_above=0.5)
    print(docs[0])
    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    return corpus, dictionary
Example #7
    def __init__(self,
                 num_topics=100,
                 min_word_count=20,
                 top_most_common_words=10,
                 min_doc_length=40,
                 max_doc_length=1000,
                 random_state=None):
        self.num_topics = num_topics
        self.min_word_count = min_word_count
        self.top_most_common_words = top_most_common_words

        assert max_doc_length > min_doc_length, \
               "max_doc_length must be greater than min_doc_length"
        self.min_doc_length = min_doc_length
        self.max_doc_length = max_doc_length
        self.random_state = random_state

        # natural language processing
        self.stop_words = self.getEnglishStopWords()
        self.bigramizer = Phrases()
Example #8
    def get_topic_extraction_glda(self, message, id):
        #         self.load_lda_topic_model()

        tf_vectorizer = CountVectorizer(max_df=1,
                                        min_df=1,
                                        vocabulary=self.glda_tf_feature_names)

        docs = []

        logger.propagate = False

        message = re.sub('\n', ' ', message)
        docs = self.message_corpus(message)

        print('Building BiGrams from the message...')
        bigram = Phrases(docs, min_count=2, threshold=2, delimiter=b' ')
        bigram_phraser = Phraser(bigram)

        texts = [bigram_phraser[line] for line in docs]

        bg_message = ' '.join(texts[0])

        tf = tf_vectorizer.fit_transform([bg_message])

        doc_topic = self.glda.transform(tf)

        self.config_dict = dict(self.config.items('TOPIC_LABEL'))

        list_topic_names = eval(self.config_dict['list_topic_names'])

        document_topics = [(list_topic_names[topicid], topicvalue)
                           for topicid, topicvalue in enumerate(doc_topic[0])
                           if topicvalue >= 0.01]

        document_topics = sorted(document_topics,
                                 key=lambda score: score[1],
                                 reverse=True)

        #         print(document_topics)

        return document_topics
Example #9
    def process_docs(self):
        for name in self.f_list:
            longName = (self.os_dir + name)

            if ".pdf" in name:
                if PDFPage.get_pages(longName, check_extractable=False):
                    try:
                        text = self.convert_pdf(longName)
                        text = self.preprocess(text)
                        self.corpus.append(text)
                        self.data.append(name)
                    except Exception:
                        print("Unable to parse PDF file: " + longName)

            if ".docx" in name:
                text = docx.process(longName)
                text = self.preprocess(text)
                self.corpus.append(text)
                self.data.append(name)

            elif name.split('.')[-1] == 'doc':
                print(
                    "Detected .doc file: " + name +
                    ". Please convert to .docx or .pdf if you want this file to be included."
                )

        trigrams = Phrases(self.corpus,
                           min_count=1,
                           threshold=2,
                           delimiter=b' ')
        trigram_phraser = Phraser(trigrams)

        trigram_token = []
        for i in self.corpus:
            trigram_token.append(trigram_phraser[i])

        self.corpus = trigram_token
        for x, arr in enumerate(self.corpus):
            self.corpus[x] = np.array(self.corpus[x])
            self.corpus[x] = self.remove_exemptions(self.corpus[x])
            self.corpus[x] = self.corpus[x].tolist()
Example #10
def make_w2v(series, stopwords=[], size=200, window=5, min_count=5, workers=-1,
             epochs=20, lowercase=True, sg=0, seed=17, cbow_mean=1, alpha=0.025,
             sample=0.001, use_bigrams=True, threshold=10, bigram_min=5):
    # turn the series into a list of token lists, lowercase it, drop stop words
    cleaned = []
    for sentence in series:
        if lowercase:
            cleaned_sentence = [word.lower() for word in sentence]
        else:
            cleaned_sentence = list(sentence)
        cleaned_sentence = [word for word in cleaned_sentence if word not in stopwords]
        cleaned.append(cleaned_sentence)

    # incorporate bigrams
    if use_bigrams:
        bigram = Phrases(cleaned, min_count=bigram_min, threshold=threshold, delimiter=b' ')
        bigram_phraser = Phraser(bigram)
        cleaned = [bigram_phraser[sent] for sent in cleaned]

    # build the model and train it on the cleaned tokens
    model = Word2Vec(cleaned, size=size, window=window,
                     min_count=min_count, workers=workers, seed=seed, sg=sg,
                     cbow_mean=cbow_mean, alpha=alpha, sample=sample)
    model.train(cleaned, total_examples=model.corpus_count, epochs=epochs)
    model_wv = model.wv

    # clear the full model to avoid unwanted transference
    del model

    return model_wv
Example #11
def write_to_file_chartssb(no_delexi_charts: List[str],
                           all_sents: List[List[str]]) -> None:
    with open(os.path.join('chartssb/original_data/', 'chartssb.box'),
              'w') as g:
        with open(os.path.join('chartssb/original_data/', 'train.box'),
                  'w') as train:
            with open(os.path.join('chartssb/original_data/', 'test.box'),
                      'w') as test:
                with open(os.path.join('chartssb/original_data/', 'valid.box'),
                          'w') as valid:

                    for chart in no_delexi_charts:

                        chart_descs, _ = turn_chart_info_into_sentences(chart)
                        #print(chart_descs)

                        bigram2 = Phrases(all_sents, min_count=1, threshold=2)
                        bigram2.add_vocab([["Financial", "Groups"],
                                           ["Law", "Firms"],
                                           ["Computer", "Science"]])
                        print("vocab=", bigram2.vocab)
                        chart_infos_sentb = turn_dict_into_sent_b(chart_descs)
                        new_infos = convert_chartssb_to_bigrams(
                            chart_infos_sentb, bigram2)
                        chart_lines_sentb = generate_files_sb(new_infos)
                        len_all_chart_sentences = len(chart_lines_sentb)
                        print("len=", len_all_chart_sentences)
                        g.write(''.join(chart_lines_sentb))

                        for line_idx, chart_line in enumerate(
                                chart_lines_sentb):
                            if line_idx in list(range(5)):
                                #print("test=", line_idx)
                                test.write(chart_line)
                            elif line_idx in list(range(5, 10)):
                                #print("valid=", line_idx)
                                valid.write(chart_line)
                            elif line_idx in list(
                                    range(10, len_all_chart_sentences)):
                                #print("train=", line_idx)
                                train.write(chart_line)
Example #12
def train_word2vec_bigram(word_statements, name='word2vec_fa_model'):
    phrases = Phrases(word_statements, min_count=30, progress_per=10000)
    bigram = Phraser(phrases)
    sentences = bigram[word_statements]
    num_cores = multiprocessing.cpu_count()
    w2v_model = Word2Vec(min_count=20,
                         window=2,
                         size=300,
                         sample=6e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=20,
                         workers=num_cores - 1)
    w2v_model.build_vocab(sentences, progress_per=10000)
    w2v_model.train(sentences,
                    total_examples=w2v_model.corpus_count,
                    epochs=30,
                    report_delay=1)
    w2v_model.save(name)
    w2v_model.init_sims(replace=True)
    return w2v_model
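A possible usage of the model saved by train_word2vec_bigram(), assuming the default model name above and a query token that is actually in the trained vocabulary:

from gensim.models import Word2Vec

w2v_model = Word2Vec.load('word2vec_fa_model')

# Any token from the training corpus works here; this one is only a placeholder.
query = 'کتاب'
if query in w2v_model.wv:
    print(w2v_model.wv.most_similar(query, topn=10))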
Example #13
def count_phrases(tokens_list):
    '''
    Use bi-grams to detect multi-word phrases, e.g. new-york or data-set.
    Although sklearn's CountVectorizer is more intuitive and performs a bit better, it converts straight to vectors,
    so Phrases is used instead to keep the original vocabulary, letting TF-IDF, Word2Vec and other pipelines share the same token lists later.
     @param tokens_list: list of tokenized documents, e.g. [['w1-1','w1-2'],['w2-1','w2-2']]
     @return: list of token lists with phrases joined, e.g. [['w1-1','w1-2'],['w2-1_w2-2']]
     @note: no longer used after the thesis defense
    '''
    i = 0
    bigram_tokens = []
    bigram = Phrases(tokens_list, min_count=1, threshold=2)
    bigram_phraser = Phraser(bigram)
    for tokens in tokens_list:
        bigram_tokens.append(bigram_phraser[tokens])
        i += 1
        print('[%s] Filtering phrases: %6d / %6d' % (t.now(), i, len(tokens_list)),
              end='\r')
    print('[%s] Phrase detection finished' % t.now())

    return bigram_tokens
Example #14
def ngrams(input_docs):
    """
    Add bigrams (and possibly trigrams) to docs (only ones that appear 20 times or more).
    Uncomment trigram lines for trigram addition.

    :param input_docs: input docs file (gensim format)
    :return: docs file (list of lists) with appended ngrams
    """
    output_docs = input_docs
    bigram = Phrases(output_docs, min_count=20)
    # trigram = Phrases(bigram[output_docs], min_count=20)

    for idx in range(len(output_docs)):
        for bigram_ in bigram[output_docs[idx]]:
            if '_' in bigram_:
                # Token is a bigram, add to document.
                output_docs[idx].append(bigram_)
                # for token in trigram[bigram[output_docs[idx]]]:
                #     if '_' in token:
                #         output_docs[idx].append(token)
    return output_docs
Example #15
def lemmantizator():
    with open('arquivao.txt', 'r') as file:
        docs = file.readlines()
    # Split the documents into tokens.
    tokenizer = RegexpTokenizer(r'\w+')
    for idx in range(len(docs)):
        docs[idx] = docs[idx].lower()  # Convert to lowercase.
        docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

    # Remove numbers, but not words that contain numbers.
    docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

    # Remove words that are three characters or shorter.
    docs = [[token for token in doc if len(token) > 3] for doc in docs]
    nltk.download('wordnet')
    from nltk.stem.wordnet import WordNetLemmatizer

    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

    # Remove rare and common tokens.
    from gensim.corpora import Dictionary
    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)
    # Filter out words that occur less than 20 documents, or more than 50% of the documents.
    dictionary.filter_extremes(no_below=20, no_above=0.5)
    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    print('Number of unique tokens: ', len(dictionary))
    print('Number of documents: ', len(corpus))

    # Add bigrams and trigrams to docs (only ones that appear 20 times or more).
    bigram = Phrases(docs, min_count=20)

    docs = [bigram[d] for d in docs]

    with open('bigrammed.txt', 'w') as f:
        for item in docs:
            f.write("%s" % item)
Example #16
def prep_text_lda(docs, vocab_size=20000):
    """ docs: (pd.Series str) cleaned text """

    english_stopwords = set([s.replace("\'", "") for s in stopwords.words("english")])
    tqdm.pandas(desc="Tokenizing")
    tokenized_docs = docs.progress_apply(lambda x: [w.lower() for w in tokenize(x)])

    bigram = Phrases(tokenized_docs.values.tolist())
    phraser = Phraser(bigram)
    tqdm.pandas(desc="Bigrams")
    bigrammed_docs = tokenized_docs.progress_apply(lambda tokens_: phraser[tokens_])

    id2word = Dictionary(bigrammed_docs.values.tolist())
    id2word.filter_extremes(keep_n=vocab_size, no_above=0.5)
    id2word.filter_tokens(bad_ids=[id2word.token2id[a] for a in english_stopwords if a in id2word.token2id])
    id2word.compactify()

    tqdm.pandas(desc="Cleaning")
    tokenized = bigrammed_docs.progress_apply(lambda doc_tokens: " ".join([w for w in doc_tokens if w in id2word.token2id]))
    reconst_docs = tokenized.apply(lambda x: x.split())

    return id2word, reconst_docs
Example #17
def convert_features(df):
    bigram_transformer = Phrases(common_texts)
    model = Word2Vec(bigram_transformer[common_texts], min_count=1)
    model.save("word2vec.model")

    for col in df.columns:
        num_unique = len(df[col].unique())
        if np.issubdtype(df[col].dtype, np.number):
            # numerical
            print(col, "[numerical", "#unique =", num_unique, "]")
            continue
        elif num_unique < 60 or (num_unique < 0.01 * df.shape[0]
                                 and num_unique < 100):
            # categorical
            print(col, "[categorical", "#unique =", num_unique, "]")
            df = convert_onehot(df, col)
        else:
            # text
            print(col, "[text", "#unique =", num_unique, "]")
            df = convert_word2vec(df, col)

    return df
Example #18
def phrs_model(sentences):
    '''
    Generate Phrases model to find potential phrases,
    save its phrases into csv file

    Input:
    sentences(list of list of words): sentences without stop words
    '''
    model_ph = Phrases(sentences)
    #model_ph.save(PHRS_MODEL_NAME)
    gensim_phrs = model_ph.export_phrases(sentences)
    gensim_phrs = list(set(gensim_phrs))
    gensim_phrs = [g[0].decode("utf-8") for g in gensim_phrs \
                                    if g[0].split()[0]!=g[0].split()[1]]

    with open(PHRS_OUTFILE, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        sent = set()
        for i in gensim_phrs:
            if i not in sent:
                writer.writerow([i])
                sent |= {i}
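In gensim 3.x, export_phrases(sentences) yields (bytes, score) pairs as used above; under gensim 4.x (an assumption about the installed version) it takes no corpus argument and returns a dict of plain-str phrases to scores, so no decoding is needed:

from gensim.models import Phrases

# Toy corpus, repeated so the default scorer actually promotes a phrase.
sentences = [["machine", "learning", "is", "fun"],
             ["machine", "learning", "is", "useful"]] * 20

model_ph = Phrases(sentences, min_count=5, threshold=0.1)

# gensim 4.x: export_phrases() -> {"machine_learning": score, ...}
for phrase, score in model_ph.export_phrases().items():
    print(phrase, score)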
Example #19
def train_phrases(paths,
                  out='data/bigram_model.phrases',
                  tokenizer=word_tokenize,
                  **kwargs):
    """
    Train a bigram phrase model on a list of files.
    """
    n = 0
    for path in paths:
        print('Counting lines for {0}...'.format(path))
        n += sum(1 for line in open(path, 'r'))
    print('Processing {0} lines...'.format(n))

    # Change to use less memory. Default is 40m.
    # dict.update() returns None, so build the defaults first and merge kwargs into them.
    defaults = {'max_vocab_size': 40000000, 'threshold': 8.}
    defaults.update(kwargs)
    kwargs = defaults

    print('Training bigrams...')
    bigram = Phrases(_phrase_doc_stream(paths, n, tokenizer=tokenizer),
                     **kwargs)

    print('Saving...')
    bigram.save(out)
Example #20
def read_data(config):
    """Reads data from provided csv file of with processed doc text"""
    data = config['csv_filenames'][0]
    docwords = []
    file_rownames = []

    with open(data, 'r') as f:
        csv_text = csv.reader(f,
                              delimiter='\t',
                              quoting=csv.QUOTE_NONE,
                              quotechar="",
                              escapechar="\\")
        csv.field_size_limit(sys.maxsize)
        row_num = 0
        for row in csv_text:
            tokens = []
            # Reformat the incoming text
            text = row[-1][2:-2].replace("'", "").split(",")
            for token in text:
                token = token.strip()
                if token != ' ':
                    tokens.append(token)
            # topicvecDir.py needs this nested list format
            # in order to run correctly.
            tokens = [tokens]
            file_rownames.append(data + str(row_num))
            docwords.append(tokens)
            row_num += 1

    # Add bigrams
    for outer_list in docwords:
        bigram = Phrases(outer_list)
        for i in range(len(outer_list)):
            for token in bigram[outer_list[i]]:
                if '_' in token:
                    outer_list[i].append(token)

    return docwords, file_rownames
Example #21
    def __init__(self, fromdate, todate):
        self.fromdate = fromdate
        self.todate = todate
        print('Start reading sentences')
        documents = [
            line.strip() for line in open(PATH + FILENAME).readlines()
            if len(line) > 1 and len(line) < 200
        ]
        sentences = [
            " ".join([w for w in sentence.split() if w not in stopWords])
            for sentence in documents
        ]
        print("start tokenization...")
        #self.corpus =  [nltk.word_tokenize(sentence) for sentence in self.sentences]
        self.corpus = [x.split(" ") for x in sentences]

        #print('Start tokenization')
        #  self.corpus = [nltk.word_tokenize(sentence) for sentence in self.sentences]
        #print("CORPUS", self.corpus)
        print('Start phrases')
        self.phrases = Phrases(sentences=self.corpus,
                               min_count=25,
                               threshold=50)
        self.bigram = Phraser(self.phrases)
        # for sent in self.bigram[self.sentences]:  # apply model to text corpus
        #    pass

        for index, sentence in enumerate(self.corpus):
            self.corpus[index] = self.bigram[sentence]

        self.model = gensim.models.Word2Vec(**W2V_PARAMETERS)
        self.model.build_vocab(self.corpus)

        print('Build Word2Vec vocabulary')
        self.model.train(self.corpus,
                         total_examples=self.model.corpus_count,
                         epochs=self.model.iter)
        print('Estimated Word2Vec model')
Example #22
def word2vec_measure():
    article_names = ["expressen", "aftonbladet", "svd", "dn"]  #,
    sentences = []

    for single_article in article_names:

        print(" \n *** " + single_article + " *****")
        articles = db.get_articles(single_article)
        bigram = Phrases()

        for row in articles:
            row = IO.filter_text(row.lower())
            sentence = [
                word for word in row if word not in stopwords.words('swedish')
            ]

            sentences.append(sentence)
            bigram.add_vocab([sentence])

    print(len(sentences))

    num_features = 300  # Word vector dimensionality
    min_word_count = 5  # Minimum word count
    num_workers = 8  # Number of threads to run in parallel
    context = 5  # `context window` is the maximum distance between the current and predicted word within a sentence.
    downsampling = 1e-3  # Downsample setting for frequent words

    # bigram_model = Word2Vec(bigram[sentences], size=100)
    bigram_model = Word2Vec(bigram[sentences], workers=num_workers, \
            size=num_features, sg=1, min_count = min_word_count, \
            window = context, sample = downsampling)

    word2vec_result = bigram_model.most_similar(
        positive=['muslimska_brödraskapet'], topn=200)
    # filepath = prop.word2vec_count+single_article+".tsv"
    filepath = prop.word2vec_count + "all_10.tsv"

    IO.write_tuple(word2vec_result, filepath)
Example #23
def generateTokens(data, data_Full, n_dim, myStopWords):
    phrases = Phrases(data)
    biggram = Phraser(phrases)
    """
    #---------Check multiple token for a word
    biggram = Phraser(phrases)
    biggram[reviews[0]]
    
    """
    #lstReviews = list(train_Dataset.Reviews.apply(lambda x: get_bigrams(x)))
    modelW2V = Word2Vec(biggram[data],
                        size=n_dim,
                        min_count=minFreq,
                        window=2,
                        sg=1)
    """
    Term document frequency for weighted average of features
    """

    tfVectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words=myStopWords)
    nmf = TruncatedSVD(n_components=n_dim)
    tfFeatureSet = nmf.fit_transform(tfVectorizer.fit_transform(data_Full))

    countVectorizer = CountVectorizer(ngram_range=(1, 2),
                                      stop_words=myStopWords,
                                      max_df=.8)
    nmf = TruncatedSVD(n_components=n_dim)
    cvFeatureSet = nmf.fit_transform(countVectorizer.fit_transform(data_Full))

    tfFeatureSet = mergeFeatureSet(tfFeatureSet, cvFeatureSet)
    #tfFeatureSet = tfVectorizer.transform(data_Full)

    tfidf = dict(
        zip(
            list(
                map(lambda x: str.replace(x, " ", "_"),
                    tfVectorizer.get_feature_names())), tfVectorizer.idf_))
    return modelW2V, tfidf, tfFeatureSet
Example #24
def preprocess_doc(doc):
    '''Preprocess a document for training.'''

    # Remove some stuff before tokenization.

    # Remove email addresses.
    doc = re.sub(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', '', doc)

    # Tokenize the document.

    # Split the document into tokens.
    tokenizer = RegexpTokenizer(r'\w+')
    doc = doc.lower()  # Convert to lowercase.
    doc = tokenizer.tokenize(doc) # Split into words.

    # Keep only purely alphabetic tokens (this also drops words that contain numbers).
    doc = [token for token in doc if token.isalpha()]

    # Remove words that are only one character.
    doc = [token for token in doc if len(token) > 1]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    stop_words = stop_words.union(['one', 'ax', 'max'])
    doc = [token for token in doc if not token in stop_words]

    # Lemmatize the documents.
    lemmatizer = WordNetLemmatizer()
    doc = [lemmatizer.lemmatize(token) for token in doc]

    # Add bigrams to the doc (only ones that appear 20 times or more).
    # Phrases expects an iterable of token lists, so wrap the single document.
    bigram = Phrases([doc], min_count=20)
    for token in bigram[doc]:
        if '_' in token:
            # Token is a bigram, add to document.
            doc.append(token)

    return doc
Example #25
    def get_bigrams_from_preprocessed(self,
                                      min_count=0.1,
                                      threshold=10.,
                                      scoring='default'):
        """
        Computes bigrams after preprocessing. NOTE: overwrites preprocessed_text_ attribute.

        ------PARAMETERS------
        min_count: minimum count of bigrams to be included
        threshold: scoring threshold  for bigrams for inclusion
        scoring: gensim Phrases scoring function to evaluate bigrams for threshold
		"""
        x = Phrases(self.preprocessed_text_,
                    min_count=min_count,
                    threshold=threshold,
                    scoring=scoring)
        x = Phraser(x)

        bigram_token = []
        for sent in self.preprocessed_text_:
            bigram_token.append(x[sent])

        self.preprocessed_text_ = bigram_token
Example #26
    def word2vec_train(self, model_f_name):
        _bigram = 0
        import gensim, logging
        if (_bigram):
            bigram = Phrases(self.texts_ko)
            self.model = word2vec.Word2Vec(bigram[self.texts_ko],
                                           **self.config)
        else:
            self.model = word2vec.Word2Vec(self.texts_ko, **self.config)

        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.INFO)
        """
        outv = KeyedVectors()
        outv.vocab = self.model.wv.vocab
        outv.index2word = self.model.wv.index2word
        outv.syn0 = self.model.syn1neg
        """
        #inout_sim = outv.most_similar('navi')
        #print (inout_sim)
        #fname = str(self.config['size'])+'_'+str(self.config['window'])+ '_'+model_f_name
        self.model.save(model_f_name)  #test
        self.model.init_sims(replace=True)
Example #27
    def __init__(self, df):
        df = df[df['Type'] == 'Article']
        word2vecSamples = list(df['Abstract'])

        stop_words = set(stopwords.words('english'))

        t0 = time()
        data = []
        for i in word2vecSamples:
            temp = []
            for j in word_tokenize(i):
                if j.lower() not in stop_words:
                    #             if j == 'amino':
                    #                 print(j)
                    temp.append(j.lower().translate(
                        str.maketrans('', '', string.punctuation)))

            data.append(temp)

        self.data = data
        self.bigram_transformer = Phrases(data)

        print("done in %0.3fs." % (time() - t0))
Example #28
    def bigramGenerator(self):
        corpusStream = self.sentenceStream()
        phrases = Phrases(corpusStream,
                          min_count=self.bigramMinCount,
                          threshold=self.thresholdBigram)
        bigram = Phraser(phrases)

        inputStream = self.sentenceStream()
        bigramSentenceList = (bigram[sentence] for sentence in inputStream)

        bigramList = set()
        for bigramSentence in bigramSentenceList:
            for item in bigramSentence:
                if "_" in item:
                    bigramList.add(item)

        print("Number of Unique Bigrams = ", len(bigramList))
        if not os.path.exists(self.trainingLocation):
            os.makedirs(self.trainingLocation)
        with open(os.path.join(self.trainingLocation, "TC-phrases-bi.txt"),
                  "a") as outFile:
            for item in sorted(bigramList):
                outFile.write(item + "\n")
Example #29
def build_word_vec(show_log=True):

    section, year = volume.split(".")
    texts_path = "../arxiv/{0}/{1}/".format(section, year)

    files_list = shared.random_glob(texts_path, n_proc_articles)
    sentences = prepare_sentences(files_list, n_proc_articles)

    if show_log:
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.INFO)

    if config.biGram:
        bigram_transformer = Phrases(sentences, min_count=10)
        sentences = list(bigram_transformer[sentences])

    sentences = shared.plural_filter(sentences)

    return Word2Vec(sentences,
                    min_count=min_count,
                    size=size,
                    window=window,
                    workers=4)
Example #30
def bigrams_with_gensim(data):
    from gensim.models import Phrases
    bigram = Phrases()
    sentences = []
    for row in data:
        title = row['Headings'].replace('[','').replace(']','').replace("'",'')
        title = title + '.'
        #title = title.replace('--',' -- ')
        sentence = [word for word in nltk.word_tokenize(title.lower())
                    if word not in string.punctuation]
        sentences.append(sentence)
        bigram.add_vocab([sentence])
    bigram_counter = Counter()
    for key in bigram.vocab.keys():
        if key not in stopwords.words("english"):
            spl = re.split(b'_', key)
            spl = [s for s in spl if s != b'']
            if len(spl) > 1:
                bigram_counter[key] += bigram.vocab[key]
    print('Bigrams with gensim')
    for key, counts in bigram_counter.most_common(50):
        print('{}: {}'.format(key, counts))
    return bigram
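The bytes handling above (re.split(b'_', key)) assumes the older gensim where Phrases.vocab keys are bytes; under gensim 4.x (an assumption) the keys are plain strings joined with '_', so the counting reduces to something like the sketch below:

from collections import Counter
from gensim.models import Phrases

sentences = [["new", "york", "is", "big"], ["new", "york", "has", "parks"]] * 10
bigram = Phrases(sentences, min_count=1, threshold=0.1)

# gensim 4.x keeps plain-str keys in bigram.vocab, with '_' as the default delimiter.
bigram_counter = Counter()
for key, count in bigram.vocab.items():
    if '_' in key:
        bigram_counter[key] += count

for key, counts in bigram_counter.most_common(10):
    print('{}: {}'.format(key, counts))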