    def setUp(self):
        """setup lee test corpora"""
        global bg_corpus, corpus, human_sim_vector, bg_corpus2, corpus2

        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
        bg_corpus_file = 'lee_background.cor'
        corpus_file = 'lee.cor'
        sim_file = 'similarities0-1.txt'

        # read in the corpora
        latin1 = partial(utils.to_unicode, encoding='latin1')
        with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f:
            bg_corpus = preprocess_documents(latin1(line) for line in f)
        with utils.smart_open(os.path.join(pre_path, corpus_file)) as f:
            corpus = preprocess_documents(latin1(line) for line in f)
        with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f:
            bg_corpus2 = [
                preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1])
                for s in f
            ]
        with utils.smart_open(os.path.join(pre_path, corpus_file)) as f:
            corpus2 = [
                preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1])
                for s in f
            ]

        # read the human similarity data
        sim_matrix = np.loadtxt(os.path.join(pre_path, sim_file))
        sim_m_size = np.shape(sim_matrix)[0]
        human_sim_vector = sim_matrix[np.triu_indices(sim_m_size, 1)]
Example 2
def parse_df(df):
    df['parsed series'] = preprocess_documents(df['series'].astype('str'))
    df['parsed reviews'] = preprocess_documents(df['reviews'].astype('str'))
    df['parsed blurb'] = preprocess_documents(df['blurb'].astype('str'))
    df['parsed'] = df['parsed reviews'] + df['parsed blurb'] + df[
        'parsed series']
    df = df.sort_values(by=['nratings'],
                        ascending=False).reset_index(drop=True)
    return df
Example 3
def clean_text_by_sentences(text, stemming):
    """ Tokenizes a given text into all_sentences, applying filters and lemmatizing them.
    Returns a SyntacticUnit list. """
    original_sentences = split_sentences(text)
    if stemming:
        filtered_sentences = [join_words(sentence) for sentence in gensim_preprocessing.preprocess_documents(original_sentences)]
    else:
        nostem_filters = [f for f in gensim_preprocessing.DEFAULT_FILTERS if f != gensim_preprocessing.stem_text]
        # preprocess_documents() takes no filters argument, so run preprocess_string()
        # per sentence with the stemming filter removed.
        filtered_sentences = [
            join_words(gensim_preprocessing.preprocess_string(sentence, filters=nostem_filters))
            for sentence in original_sentences
        ]

    return merge_syntactic_units(original_sentences, filtered_sentences)
Example 4
def processTrainingData(list_of_tweets):
    list_of_tweets = [prep.strip_short(line) for line in list_of_tweets]  # removes words shorter than 3 characters
    list_of_tweets = prep.preprocess_documents(list_of_tweets)  # strips punctuation, numbers and stopwords, then stems

    return list_of_tweets  # each tweet is returned as a list
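
A quick usage sketch (the sample tweets below are made up for illustration; the exact output tokens depend on gensim's default filters, which lowercase, strip punctuation and numbers, drop stopwords and short words, and stem):

# Illustrative only: assumes `prep` is gensim.parsing.preprocessing, as in the example above.
from gensim.parsing import preprocessing as prep

sample_tweets = [
    "Gensim makes preprocessing tweets easy!",
    "Natural language processing is fun",
]
for tweet, tokens in zip(sample_tweets, processTrainingData(sample_tweets)):
    print(tweet, "->", tokens)  # each tweet becomes a list of lowercased, stemmed tokens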
Example 5
def get_response(utterance,genre_model):
	#preprocess utterance to remove unwanted characters and convert to lower case
	pre = utterance.lower().replace("-","").replace("?","").replace("'","").split()
	preprocess=preprocess_documents(pre)
	preprocessed=[str(i) for i in preprocess]
	#get doc2vec representation for utterance with respect to genre classifier model
	utterance_genre_vector = genre_model.infer_vector(preprocessed)
	#get predicted document(genre)
	sims = genre_model.docvecs.most_similar([utterance_genre_vector])
	matched_genre=sims[0][0]
	#load in document sentence classification model and predict target sentence
	doc_model = Doc2Vec.load(model_dir+matched_genre+".model")
	#get doc2vec representation for utterance with respect to  sentence classification model
	utterance_doc_vector = doc_model.infer_vector(preprocessed)
	#get predicted sentence
	sims = doc_model.docvecs.most_similar([utterance_doc_vector]) 
	matched_sentence_index=sims[0][0]
	#get corresponding response to that sentence and return it
	f=open(data_dir+matched_genre)
	lines=f.readlines()
	response=""
	if (matched_sentence_index+1<len(lines)):
		response= lines[matched_sentence_index+1]
	else:
		response=lines[matched_sentence_index]
	return response.replace("-","")
def loadRedditData():
    # Loads all saved Reddit posts
    df = pd.read_csv("data/reddit_todayilearned.csv")
    # Selects only the following columns
    df = df[[
        "id", "author", "domain", "url", "num_comments", "score", "title",
        "retrieved_on", "over_18", "permalink", "created_utc",
        "link_flair_text"
    ]]
    # Leaves only the non-adult content
    df = df[~df["over_18"]]
    # Removes documents with lower than 10 score
    df = df[df["score"] > 10]
    # Resets the index
    df.reset_index(inplace=True, drop=True)
    # Creates a list of documents
    documents = df["title"].tolist()
    # Preprocesses the documents
    texts = preprocess_documents(documents)
    # Creates the dictionary
    dictionary = corpora.Dictionary(texts)
    # Creates the corpus using bag-of-words
    corpus = [dictionary.doc2bow(text) for text in texts]
    # Generates the TF-IDF model
    tfidf = models.TfidfModel(corpus)
    # Creates the TF-IDF corpus
    corpus_tfidf = tfidf[corpus]
    # Fits an LSI model (with 15 topics)
    model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=15)
    # Identifies topics for each document
    corpus_wrapper = model[corpus_tfidf]
    # Creates the similarity index
    index = similarities.MatrixSimilarity(corpus_wrapper)
    return corpus_wrapper, index, df
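
A minimal usage sketch for the values returned above (it simply reuses the first post's LSI vector as the query, so no extra models are needed):

# Sketch: rank all stored post titles against the first post's LSI vector.
corpus_wrapper, index, df = loadRedditData()
first_vec = next(iter(corpus_wrapper))   # LSI representation of the first title
sims = index[first_vec]                  # cosine similarities to every title
for i in sims.argsort()[::-1][:5]:
    print(round(float(sims[i]), 3), df["title"].iloc[i])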
Example 7
def clean_text_by_word(text, deacc=True):
    """Tokenize a given text into words, applying filters and lemmatize them.

    Parameters
    ----------
    text : str
        Given text.
    deacc : bool, optional
        Remove accentuation if True.

    Returns
    -------
    dict
        Words as keys, :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` as values.

    Example
    -------
    >>> from gensim.summarization.textcleaner import clean_text_by_word
    >>> clean_text_by_word("God helps those who help themselves")
    {'god': Original unit: 'god' *-*-*-* Processed unit: 'god',
    'help': Original unit: 'help' *-*-*-* Processed unit: 'help',
    'helps': Original unit: 'helps' *-*-*-* Processed unit: 'help'}

    """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=deacc))
    filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)]
    if HAS_PATTERN:
        tags = tag(join_words(original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return {unit.text: unit for unit in units}
Example 8
def clean_text_by_word(text, deacc=True):
    """Tokenize a given text into words, applying filters and lemmatize them.

    Parameters
    ----------
    text : str
        Given text.
    deacc : bool, optional
        Remove accentuation if True.

    Returns
    -------
    dict
        Words as keys, :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` as values.

    Example
    -------
    .. sourcecode:: pycon

        >>> from gensim.summarization.textcleaner import clean_text_by_word
        >>> clean_text_by_word("God helps those who help themselves")
        {'god': Original unit: 'god' *-*-*-* Processed unit: 'god',
        'help': Original unit: 'help' *-*-*-* Processed unit: 'help',
        'helps': Original unit: 'helps' *-*-*-* Processed unit: 'help'}

    """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=deacc))
    filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)]
    if HAS_PATTERN:
        tags = tag(join_words(original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return {unit.text: unit for unit in units}
Example 9
def clean_text_by_sentences(text):
    """ Tokenizes a given text into sentences, applying filters and lemmatizing them.
    Returns a SyntacticUnit list. """
    original_sentences = split_sentences(text)
    filtered_sentences = [join_words(sentence) for sentence in preprocess_documents(original_sentences)]

    return merge_syntactic_units(original_sentences, filtered_sentences)
Example 10
def clean_text_by_sentences(text):
    """ Tokenizes a given text into sentences, applying filters and lemmatizing them.
    Returns a SyntacticUnit list. """
    original_sentences = split_sentences(text)
    filtered_sentences = [join_words(sentence) for sentence in preprocess_documents(original_sentences)]

    return merge_syntactic_units(original_sentences, filtered_sentences)
Example 11
def clean_text_by_sentences(text):
    original_sentences = split_sentences(text)
    filtered_sentences = [
        join_words(sentence)
        for sentence in preprocess_documents(original_sentences)
    ]
    return merge_syntactic_units(original_sentences, filtered_sentences)
Example 12
    def setUp(self):
        """setup lee test corpora"""
        global bg_corpus, corpus, human_sim_vector

        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
        bg_corpus_file = 'lee_background.cor'
        corpus_file = 'lee.cor'
        sim_file = 'similarities0-1.txt'

        # read in the corpora
        with open(os.path.join(pre_path, bg_corpus_file)) as f:
            bg_corpus = preprocess_documents(f)
        with open(os.path.join(pre_path, corpus_file)) as f:
            corpus = preprocess_documents(f)

        # read the human similarity data
        sim_matrix = np.loadtxt(os.path.join(pre_path, sim_file))
        sim_m_size = np.shape(sim_matrix)[0]
        human_sim_vector = sim_matrix[matutils.triu_indices(sim_m_size, 1)]
Example 13
    def setUp(self):
        """setup lee test corpora"""
        global bg_corpus, corpus, human_sim_vector

        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
        bg_corpus_file = 'lee_background.cor'
        corpus_file = 'lee.cor'
        sim_file = 'similarities0-1.txt'

        # read in the corpora
        with open(os.path.join(pre_path, bg_corpus_file)) as f:
            bg_corpus = preprocess_documents(f)
        with open(os.path.join(pre_path, corpus_file)) as f:
            corpus = preprocess_documents(f)

        # read the human similarity data
        sim_matrix = np.loadtxt(os.path.join(pre_path, sim_file))
        sim_m_size = np.shape(sim_matrix)[0]
        human_sim_vector = sim_matrix[matutils.triu_indices(sim_m_size, 1)]
Example 14
    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
            words = doc.lower().replace("-", "").replace("?", "").replace("'", "").split()
            # remove stop words, punctuation and numbers
            preprocess = preprocess_documents(words)
            # convert unicode to ascii string
            preprocessed = [str(i) for i in preprocess]
            yield LabeledSentence(preprocessed, tags=[self.labels_list[idx]])
Example 15
def clean_text_by_word(text, deacc=True):
    """ Tokenizes a given text into words, applying filters and lemmatizing them.
    Returns a dict of word -> syntacticUnit. """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=deacc))
    filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)]
    if HAS_PATTERN:
        tags = tag(join_words(original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return {unit.text: unit for unit in units}
Example 16
def clean_text_by_word(text):
    """ Tokenizes a given text into words, applying filters and lemmatizing them.
    Returns a dict of word -> syntacticUnit. """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=True))
    filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)]
    if HAS_PATTERN:
        tags = tag(join_words(original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return dict((unit.text, unit) for unit in units)
Example 17
    def __build_document(self):
        """
        Open the pre-processed document file, read it in, and process it with gensim's
        preprocess_documents function (see the gensim documentation). Then build a
        vocabulary based on the processed documents.
        """
        with open('consolidated_nyt.tsv') as r:
            self.documents = r.read().splitlines()
        self.documents = preprocess_documents(self.documents)
        self.number_of_documents = len(self.documents)
        self.vocabulary = corpora.Dictionary(self.documents)
        self.vocabulary_size = len(self.vocabulary)
        print("Number of documents: " + str(len(self.documents)))
        print("Vocabulary size: " + str(self.vocabulary_size))
Example 18
    def setUp(self):
        """setup lee test corpora"""
        global bg_corpus, corpus, human_sim_vector, bg_corpus2, corpus2

        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
        bg_corpus_file = 'lee_background.cor'
        corpus_file = 'lee.cor'
        sim_file = 'similarities0-1.txt'

        # read in the corpora
        latin1 = lambda line: utils.to_unicode(line, encoding='latin1')
        with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f:
            bg_corpus = preprocess_documents(latin1(line) for line in f)
        with utils.smart_open(os.path.join(pre_path, corpus_file)) as f:
            corpus = preprocess_documents(latin1(line) for line in f)
        with utils.smart_open(os.path.join(pre_path, bg_corpus_file)) as f:
            bg_corpus2 = [preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1]) for s in f]
        with utils.smart_open(os.path.join(pre_path, corpus_file)) as f:
            corpus2 = [preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1]) for s in f]

        # read the human similarity data
        sim_matrix = np.loadtxt(os.path.join(pre_path, sim_file))
        sim_m_size = np.shape(sim_matrix)[0]
        human_sim_vector = sim_matrix[matutils.triu_indices(sim_m_size, 1)]
Example 19
def prepare_corpus(documents):
    tic = time()
    # lower, strip tags, strip punctuation, strip multiple whitespaces,
    # strip numeric, remove stopwords, strip short, stem text
    texts = preprocessing.preprocess_documents(documents)

    #filter out hapax legomena
    all_tokens = sum(texts, [])
    tokens_once = set(word for word in set(all_tokens)
                      if all_tokens.count(word) == 1)
    texts = [[word for word in text if word not in tokens_once]
             for text in texts]
    print(time() - tic)

    dictionary = corpora.Dictionary(texts)
    dictionary.save('texts/' + prefix + '/dictionary.dict')
    raw_corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('texts/' + prefix + "/corpus.mm", raw_corpus)
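
For reference, the comment at the top of prepare_corpus describes gensim's DEFAULT_FILTERS chain; a small sketch making that equivalence explicit (preprocess_documents simply applies preprocess_string with the default filters to each document):

from gensim.parsing.preprocessing import DEFAULT_FILTERS, preprocess_documents, preprocess_string

doc = "Some <b>raw</b> text, with 3 numbers and punctuation!"
# DEFAULT_FILTERS = lowercase, strip_tags, strip_punctuation, strip_multiple_whitespaces,
# strip_numeric, remove_stopwords, strip_short, stem_text
assert preprocess_string(doc, filters=DEFAULT_FILTERS) == preprocess_documents([doc])[0]

Dropping the last filter (DEFAULT_FILTERS[:-1]), as the setUp examples earlier on this page do, skips only the stemming step.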
Example 20
def clean_text_by_sentences(text):
    """Tokenize a given text into sentences, applying filters and lemmatize them.

    Parameters
    ----------
    text : str
        Given text.

    Returns
    -------
    list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        Sentences of the given text.

    """
    original_sentences = split_sentences(text)
    filtered_sentences = [join_words(sentence) for sentence in preprocess_documents(original_sentences)]

    return merge_syntactic_units(original_sentences, filtered_sentences)
Example 21
def clean_text_by_sentences(text):
    """Tokenize a given text into sentences, applying filters and lemmatize them.

    Parameters
    ----------
    text : str
        Given text.

    Returns
    -------
    list of :class:`~gensim.summarization.syntactic_unit.SyntacticUnit`
        Sentences of the given text.

    """
    original_sentences = split_sentences(text)
    filtered_sentences = [join_words(sentence) for sentence in preprocess_documents(original_sentences)]

    return merge_syntactic_units(original_sentences, filtered_sentences)
Example 22
def load_corpus(filename, d=3):
    import csv, sys

    # Increase max line length for csv.reader:
    max_int = sys.maxsize
    decrement = True
    while decrement:
        decrement = False
        try:
            csv.field_size_limit(max_int)
        except OverflowError:
            max_int = int(max_int / 10)
            decrement = True

    docs = []
    labs = []
    labelmap = dict()
    pat = re.compile(r"[A-Z]\d{2}")
    f = open(filename, 'r')
    reader = csv.reader(f)
    for row in reader:
        doc = row[1]
        lab = row[2]
        if len(lab) > 3:
            lab = lab.split(" ")
            lab = list(filter(lambda i: pat.search(i), lab))
            lab = [partition_label(x, d) for x in lab]
            lab = [item for sublist in lab for item in sublist]
            lab = list(set(lab))
            for x in lab:
                labelmap[x] = 1
        else:
            lab = partition_label(lab, d)
            for x in lab:
                labelmap[x] = 1
                # lab = [lab]
        docs.append(doc)
        labs.append(lab)
    f.close()
    print("Stemming documents ....")
    docs = gensimm.preprocess_documents(docs)
    return docs, labs, list(labelmap.keys())
Example 23
logging.info('loading word mapping')
id2word, word2id = utils.loadDictionary(working_corpus + word_ids_extension)
dictionary = Dictionary(word2id=word2id, id2word=id2word)

logging.info('loading corpus')
corpus_bow = MmCorpus(working_corpus + '_bow.mm')

logging.info("create log_ent model and save it to disk")
tfidf = LogEntropyModel(corpus_bow,
                        id2word=dictionary.id2token,
                        normalize=True)
tfidf.save(result_path + corpus_name + log_ent_extension)

logging.info('load small lee corpus and preprocess')
raw_lee_texts = utils.get_txt(lee_corpus)
preproc_lee_texts = preprocessing.preprocess_documents(raw_lee_texts)
bow_lee_texts = [
    dictionary.doc2bow(text, allowUpdate=False, returnMissingWords=False)
    for text in preproc_lee_texts
]

logging.info('initialize LSI model')
lsi = models.LsiModel(tfidf[corpus_bow], id2word=id2word, numTopics=num_topics)
lsi.save((result_path + corpus_name + '_%i_ent' + lsi_extension) % num_topics)
logging.info('transforming small lee corpus (LSI)')
corpus_lsi = lsi[tfidf[bow_lee_texts]]

# # compute pairwise similarity matrix of transformed corpus
sim_matrix = np.zeros((len(corpus_lsi), len(corpus_lsi)))
for i, par1 in enumerate(corpus_lsi):
    for j, par2 in enumerate(corpus_lsi):
Example 24

logging.info('loading word mapping')
id2word, word2id = utils.loadDictionary(working_corpus + word_ids_extension)
dictionary = Dictionary(word2id=word2id, id2word=id2word)

logging.info('loading corpus')
corpus_bow = MmCorpus(working_corpus + '_bow.mm')

logging.info("create log_ent model and save it to disk")
tfidf = LogEntropyModel(corpus_bow, id2word=dictionary.id2token, normalize = True)
tfidf.save(result_path + corpus_name + log_ent_extension)

logging.info('load small lee corpus and preprocess')
raw_lee_texts = utils.get_txt(lee_corpus)
preproc_lee_texts = preprocessing.preprocess_documents(raw_lee_texts)
bow_lee_texts = [dictionary.doc2bow(text,
                                    allowUpdate=False,
                                    returnMissingWords=False)
                for text in preproc_lee_texts]

logging.info('initialize LSI model')
lsi = models.LsiModel(tfidf[corpus_bow], id2word=id2word, numTopics=num_topics)
lsi.save((result_path + corpus_name + '_%i_ent'  + lsi_extension) % num_topics)
logging.info('transforming small lee corpus (LSI)')
corpus_lsi = lsi[tfidf[bow_lee_texts]]

# # compute pairwise similarity matrix of transformed corpus
sim_matrix = np.zeros((len(corpus_lsi), len(corpus_lsi)))
for i, par1 in enumerate(corpus_lsi):
    for j, par2 in enumerate(corpus_lsi):
Example 25
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    working_corpus = path.join(base_path, p['corpus_path'], p['corpus_name'])
    human_data_file = path.join(base_path, p['human_data_file'])
    lee_corpus = path.join(base_path, p['lee_corpus'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # remember starting time for runtime evaluation
    start = datetime.now()

    logger.info('loading word mapping')
    dictionary = Dictionary.load(path.join(base_path,
                                           p['corpus_path'],
                                           p['dict_name']))
    Dictionary.save(dictionary, path.join(output_dir, p['dict_name']))
    logger.info(dictionary)

    logger.info('loading corpus')
    corpus_bow = MmCorpus(working_corpus)

    logger.info("create preprocessing model and save it to disk")
    if p['pre_model'] == 'tfidf':
        pre_model = TfidfModel(corpus_bow, id2word=dictionary, normalize=True)
    elif p['pre_model'] == 'log_ent':
        pre_model = LogEntropyModel(corpus_bow,
                                    id2word=dictionary, normalize=True)
    else:
        raise ValueError('model parameter %s not known' % p['pre_model'])
    pre_model.save(os.path.join(output_dir, p['pre_model_extension']))

    logger.info('initialize LSI model')
    lsi = models.LsiModel(pre_model[corpus_bow],
                          id2word=dictionary, num_topics=p['num_topics'])
    lsi.save(os.path.join(output_dir, p['lsi_extension']))
    logger.info('finished --> lsi model saved to: %s' %
                os.path.join(output_dir, p['lsi_extension']))

    # check for correlation with lee human data
    logger.info('load small lee corpus and preprocess')
    with open(lee_corpus, 'r') as f:
        preproc_lee_texts = preprocessing.preprocess_documents(f.readlines())
    bow_lee_texts = [dictionary.doc2bow(text,
                                        allow_update=False,
                                        return_missing=False)
                    for text in preproc_lee_texts]

    logger.info('transforming small lee corpus (LSI)')
    corpus_lsi = lsi[pre_model[bow_lee_texts]]

    # # compute pairwise similarity matrix of transformed corpus
    sim_matrix = np.zeros((len(corpus_lsi), len(corpus_lsi)))
    for i, par1 in enumerate(corpus_lsi):
        for j, par2 in enumerate(corpus_lsi):
            sim_matrix[i, j] = matutils.cossim(par1, par2)
    sim_vector = sim_matrix[np.triu_indices(len(corpus_lsi), 1)]

    # read the human similarity data and flatten upper triangular
    human_sim_matrix = np.loadtxt(human_data_file)
    sim_m_size = np.shape(human_sim_matrix)[0]
    human_sim_vector = human_sim_matrix[np.triu_indices(sim_m_size, 1)]

    # compute correlations
    cor = np.corrcoef(sim_vector, human_sim_vector)
    logger.info("correlation with lee human data: %f" % cor[0, 1])

    dif = start - datetime.now()
    logger.info("finished after %d days and %d secs" % (dif.days, dif.seconds))
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
import gensim
import pandas as pd
from gensim.parsing.preprocessing import preprocess_documents
from multiprocessing import Pool
from functools import partial
import math
import numpy as np

# use the newsgroup data as corpus
df = pd.read_json(
    "https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json"
)
documents = df.content.tolist()
documents = preprocess_documents(documents)

# fit an LDA model, n_topic = 5
news_dictionary = Dictionary(documents)
news_dictionary.filter_extremes(no_below=5,
                                no_above=0.5,
                                keep_n=5000,
                                keep_tokens=None)
corpus = [news_dictionary.doc2bow(text) for text in documents]
lda = gensim.models.LdaModel(corpus, num_topics=5, id2word=news_dictionary)

lda.show_topics()

# convert gensim corpus to a sparse document-term matrix for coherence measure
corpus_dense = gensim.matutils.corpus2csc(corpus,
                                          num_terms=len(
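
The snippet above is cut off mid-call; as an alternative sketch (reusing the lda, documents and news_dictionary names from this example), gensim's CoherenceModel can score the topics directly, without building a dense document-term matrix:

# Hedged sketch: c_v topic coherence for the 5-topic LDA model fitted above.
from gensim.models import CoherenceModel

cm = CoherenceModel(model=lda, texts=documents, dictionary=news_dictionary, coherence='c_v')
print('c_v coherence:', cm.get_coherence())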
Example 27
model = gensim.models.Doc2Vec(
    size=200, min_count=1, alpha=0.025, min_alpha=0.001, dm_concat=1
)  #,window=7)#,train_words=True,learn_doctags =True,learn_words =True)
model.build_vocab(it)
model.train(it, total_examples=model.corpus_count, epochs=50)  #best is 48
model.save(model_dir + "genre.model")

#evaluate model accuracy over entire documents
true_pred = 0
for x in documents:
    with open(data_dir + x, 'r') as f:
        utterance = f.read()
        preprocessed = utterance.lower().replace("-",
                                                 "").replace("?", "").replace(
                                                     "'", "").split()
        preprocess = preprocess_documents(preprocessed)
        preprocessed = [str(i) for i in preprocess]
        utterance_genre_vector = model.infer_vector(preprocessed)
        sims = model.docvecs.most_similar([utterance_genre_vector])
        if (sims[0][0] == x):
            true_pred = true_pred + 1
accuracy = true_pred / len(documents)
print("model accuracy over entire document is: " + str(accuracy))
#evaluate model accuracy over entire documents
true_pred = 0
total_sentences = 0
for x in documents:
    with open(data_dir + x, 'r') as f:
        document = f.read().splitlines()
        for line in document:
            total_sentences = total_sentences + 1
def generate(db_uri, min_contexts=4, preprocess=False):
    """ Generate a list of citation contexts, given criteria:
            min_contexts
            preprocess  (preprocess_documents default; if off only punctuation
                         and multiple whitespaces are removed)
    """

    Base = declarative_base()

    engine = create_engine(db_uri)
    Base.metadata.create_all(engine)
    Base.metadata.bind = engine
    DBSession = sessionmaker(bind=engine)
    session = DBSession()

    CitContext = Table('papercitationcontexts',
                       Base.metadata,
                       autoload=True,
                       autoload_with=engine)

    print('querying DB')
    non_unique = session.query(CitContext.columns.paperreferenceid).\
         group_by(CitContext.columns.paperreferenceid).\
         having(func.count(CitContext.columns.paperreferenceid)
                 > min_contexts-1).\
         subquery()

    cit_contexts_db = session.query(CitContext).\
        filter(CitContext.columns.paperreferenceid.in_(non_unique)).\
        all()  # order_by(BibitemArxivIDMap.arxiv_id.desc()).all()
    print(len(cit_contexts_db))
    # 187595127
    print(dir(cit_contexts_db[0]))
    # ['__add__', '__class__', '__contains__', '__delattr__', '__dir__',
    #  '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__',
    #  '__getitem__', '__getnewargs__', '__gt__', '__hash__', '__init__',
    #  '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__',
    #  '__module__', '__mul__', '__ne__', '__new__', '__reduce__',
    #  '__reduce_ex__', '__repr__', '__rmul__', '__setattr__', '__sizeof__',
    #  '__slots__', '__str__', '__subclasshook__', '_asdict', '_fields',
    #  '_real_fields', 'citationcontext', 'count', 'index', 'keys', 'paperid',
    #  'paperreferenceid']
    #
    # TODO
    # Work in progress. ↑ MAG DB results ↓ stuff to adjust for generating
    #                                      a dataset
    #
    sys.exit()

    print('merging bibitems')
    cited_docs_pre = {}
    uuid_aid_map = {}
    for bibitem in bibitems:
        if global_ids == 'mag':
            aid = bibitem.BibitemMAGIDMap.mag_id
        else:
            aid = bibitem.BibitemArxivIDMap.arxiv_id
        uuid = bibitem.Bibitem.uuid
        uuid_aid_map[uuid] = aid
        in_doc = bibitem.Bibitem.in_doc
        if aid not in cited_docs_pre:
            cited_docs_pre[aid] = {}
        if in_doc not in cited_docs_pre[aid]:
            cited_docs_pre[aid][in_doc] = []
        cited_docs_pre[aid][in_doc].append(uuid)
    print('checking merging results')
    cited_docs = {}
    for aid, doc_dict in cited_docs_pre.items():
        # for evaluation we *need* at least 2 documents containing citation
        # contexts (in order to perform a per doc test/train split)
        if len(doc_dict) > 1:
            cited_docs[aid] = []
            for in_doc, uuid_list in doc_dict.items():
                cited_docs[aid].append({
                    'uuid': uuid_list[0],  # uuid_list should always be len. 1
                    'in_doc': in_doc
                })
    print('going through docs')
    contexts = []
    for aid, doc_list in cited_docs.items():
        tmp_list = []
        num_docs = 0
        for doc in doc_list:
            in_doc = doc['in_doc']
            fn = '{}.txt'.format(in_doc)
            text_file = os.path.join(in_dir, fn)
            with open(text_file) as f:
                text = f.read()
            marker = '{{{{cite:{}}}}}'.format(doc['uuid'])
            marker_found = False
            for m in re.finditer(marker, text):
                margin = int(context_size / 2)
                idx = m.start()
                edx = m.end()
                pre = text[:idx]
                post = text[edx:]
                adj_pre = find_adjacent_citations(pre,
                                                  uuid_aid_map,
                                                  backwards=True)
                adj_post = find_adjacent_citations(post, uuid_aid_map)
                adjacent_citations = adj_pre + adj_post
                pre = re.sub(CITE_PATT, '', pre)
                post = re.sub(CITE_PATT, '', post)
                # heuristic pre-cutting (10 times average word length)
                pre = pre[-margin * 6 * 10:]
                post = post[:margin * 6 * 10]
                if preprocess:
                    pre, post = preprocess_documents([pre, post])
                else:
                    custom_filter = [
                        strip_punctuation, strip_multiple_whitespaces
                    ]
                    pre = preprocess_string(pre, custom_filter)
                    post = preprocess_string(post, custom_filter)
                placeholder = ''
                if with_placeholder:
                    placeholder = ' [] '
                context = '{}{}{}'.format(' '.join(pre[-margin:]), placeholder,
                                          ' '.join(post[:margin]))
                adj_cit_str = '[{}]'.format('|'.join(adjacent_citations))
                tmp_list.append([aid, adj_cit_str, in_doc, context])
                marker_found = True
            if marker_found:
                num_docs += 1
        if len(tmp_list) >= min_contexts and num_docs > 1:
            contexts.extend(tmp_list)
    print(len(contexts))
    with open('items.csv', 'w') as f:
        for vals in contexts:
            line = '{}\n'.format(','.join(vals))
            f.write(line)
Example 29
def processTrainingData(list_of_tweets):
    list_of_tweets = [prep.strip_short(line) for line in list_of_tweets]
    list_of_tweets = prep.preprocess_documents(list_of_tweets)
    return list_of_tweets
Example 30
def main(param_file=None):

    # setup
    p, base_path, output_dir = tools.setup(param_file)
    result_path = path.join(base_path, p['result_path'])
    lee_corpus = path.join(base_path, p['lee_corpus'])
    logger = tools.get_logger('gensim', path.join(output_dir, "run.log"))
    logger.info("running %s" % ' '.join(sys.argv))

    # remember starting time for runtime evaluation
    start = datetime.now()

    # load model and corpus
    logger.info('loading word mapping')
    dictionary = Dictionary.load(path.join(result_path,
                                           p['run'], p['dict_extension']))

    model_path = path.join(result_path, p['run'], p['lsi_ext'])
    logger.info('load model from: %s' % model_path)
    lsi = LsiModel.load(model_path)
    pre = SaveLoad.load(path.join(result_path, p['run'], p['pre_model_ext']))

    logging.info('load small lee corpus and preprocess')
    with open(lee_corpus, 'r') as f:
        preproc_lee_texts = preprocessing.preprocess_documents(f.readlines())
    bow_lee_texts = [dictionary.doc2bow(text,
                                        allow_update=False,
                                        return_missing=False)
                    for text in preproc_lee_texts]

    logger.info('transforming small lee corpus (only pre model)')
    corpus_pre = pre[bow_lee_texts]

    # read the human similarity data and flatten upper triangular
    human_sim_matrix = np.loadtxt(path.join(base_path, p['human_data_file']))
    sim_m_size = np.shape(human_sim_matrix)[0]
    human_sim_vector = human_sim_matrix[np.triu_indices(sim_m_size, 1)]

    max_topics = lsi.num_topics

    logger.info("iterate from %d to %d dimensions (stepsize: %d)" %
                (p['min_dim'], max_topics, p['dim_step']))

    iter_range = range(p['min_dim'], max_topics, p['dim_step'])
    res = np.zeros(len(iter_range))
    for k, l in enumerate(iter_range):

        # do the lower dimensionality transformation
        lsi.num_topics = l
        corpus_lsi = lsi[corpus_pre]

        # compute pairwise similarity matrix of transformed corpus
        sim_matrix = np.zeros((len(corpus_lsi), len(corpus_lsi)))
        for i, par1 in enumerate(corpus_lsi):
            for j, par2 in enumerate(corpus_lsi):
                sim_matrix[i, j] = matutils.cossim(par1, par2)
        sim_vector = sim_matrix[np.triu_indices(len(corpus_lsi), 1)]

        # compute correlations
        cor = np.corrcoef(sim_vector, human_sim_vector)
        logger.info("step %d: correlation with lee data: %f" % (k, cor[0, 1]))
        res[k] = cor[0, 1]

    plt.figure()
    plt.plot(iter_range, res)
    plt.savefig(os.path.join(output_dir, 'cor_plot.' + p['plot_extension']))
    plt.close()
    np.save(path.join(output_dir, 'model_dim_res.npy'), res)

    dif = datetime.now() - start
    logger.info("finished after %d days and %d secs" % (dif.days, dif.seconds))
Example 31
def LSI(polarity_cleaned_data, LSI_input):
    df = pd.read_json(polarity_cleaned_data, orient='split')
    text_corpus = df['Comment']

    processed_corpus = preprocess_documents(text_corpus)
    dictionary = gensim.corpora.Dictionary(processed_corpus)
    bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

    lsi = gensim.models.LsiModel(bow_corpus, num_topics=200)
    index = gensim.similarities.MatrixSimilarity(lsi[bow_corpus])

    new_doc = gensim.parsing.preprocessing.preprocess_string(LSI_input)
    new_vec = dictionary.doc2bow(new_doc)
    vec_bow_tfidf = lsi[new_vec]

    sims = index[vec_bow_tfidf]

    comment_list = []
    cosine_similarity = []
    comment_polarity = []
    comment_subjectivity = []
    comment_upvotes = []
    for s in sorted(enumerate(sims), key=lambda item: -item[1])[:10]:
        comment_list.append(f"{df['Comment'].iloc[s[0]]}")
        cosine_similarity.append(s[1])
        comment_polarity.append(df['Polarity'].iloc[s[0]])
        comment_subjectivity.append(df['Subjectivity'].iloc[s[0]])
        comment_upvotes.append(df['Upvotes'].iloc[s[0]])

    d = {
        'Cosine Similarity': cosine_similarity,
        'Comments': comment_list,
        'Polarity': comment_polarity,
        'Subjectivity': comment_subjectivity,
        'Upvotes': comment_upvotes
    }
    LSI_df = pd.DataFrame(d)

    ## averages for top 10 comment results
    columns = ['Polarity', 'Subjectivity', 'Cosine Similarity']
    averages = [
        round(LSI_df['Polarity'].mean(), 2),
        round(LSI_df['Subjectivity'].mean(), 2),
        round(LSI_df['Cosine Similarity'].mean(), 2)
    ]

    fig5 = go.Figure(
        data=[go.Bar(x=columns, y=averages, marker=dict(color='#ffb300'))])
    fig5.update_layout(
        font=dict(color='#ff9100'),
        title='Statistical Averages for Search Results',
        xaxis=dict(
            title='Comments (from highest cosine similarity to lowest)', ),
        yaxis=dict(
            title='Polarity, Subjectivity and Cosine Similarity Averages',
            gridcolor='darkgray'),
        plot_bgcolor='#212121',
        paper_bgcolor='#212121')
    fig5.update_traces(opacity=.75)

    return html.Div(children=[
        html.Div(children=[
            dash_table.DataTable(
                columns=[{
                    'name': i,
                    'id': i
                } for i in LSI_df.columns],
                style_table={'overflow': 'auto'},
                data=LSI_df.to_dict('records'),
                style_cell={
                    'textAlign': 'left',
                    'whiteSpace': 'normal',
                    'font-family': 'Helvetica',
                    'font-weight': 'lighter',
                    'height': 'auto',
                    'backgroundColor': '#1a1a1a',
                    'color': 'darkgray'
                },
                style_header={
                    'font-weight': 'bold',
                },
                css=[{
                    'selector':
                    '.dash-spreadsheet td div',
                    'rule':
                    '''
                                line-height: 15px;
                                max-height: 30px; min-height: 30px; height: 30px;
                                display: block;
                                overflow-y: hidden;
                            '''
                }],
                tooltip_duration=None,
                tooltip_data=[{
                    column: {
                        'value': str(value),
                        'type': 'markdown'
                    }
                    for column, value in row.items()
                } for row in LSI_df.to_dict('records')],
            )
        ],
                 className='datatable'),
        html.Div(className='LSI-bar', children=[dcc.Graph(figure=fig5)])
    ])
Example 32
# Latent semantic indexing (LSI) is an NLP technique used to find similar
# pieces of text. In this example, given a corpus of movie plot summaries, one
# can find the plot most similar to an input string. LSI works via singular
# value decomposition (SVD), which is essentially uncentered PCA. It copes well
# with synonymy and polysemy in language, but it is computationally expensive
# and as such is not recommended for processing documents in bulk.
import pandas as pd
import gensim
from gensim.parsing.preprocessing import preprocess_documents

df = pd.read_csv('wiki_movie_plots_deduped.csv', sep=',')
df = df[df['Release Year'] >= 2000]
text_corpus = df['Plot'].values

processed_corpus = preprocess_documents(text_corpus)
dictionary = gensim.corpora.Dictionary(processed_corpus)
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

tfidf = gensim.models.TfidfModel(bow_corpus, smartirs='npu')
corpus_tfidf = tfidf[bow_corpus]

# num_topics is a hyperparameter that can be fine tuned using Topic Coherence measure
lsi = gensim.models.LsiModel(corpus_tfidf, num_topics=200)
index = gensim.similarities.MatrixSimilarity(lsi[corpus_tfidf])

new_doc = gensim.parsing.preprocessing.preprocess_string(new_doc)  # new_doc: the free-text query string to search for
new_vec = dictionary.doc2bow(new_doc)
vec_bow_tfidf = tfidf[new_vec]
vec_lsi = lsi[vec_bow_tfidf]
sims = index[vec_lsi]
for s in sorted(enumerate(sims), key=lambda item: -item[1])[:10]:
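    # The listing is cut off here. A minimal loop body, assuming the deduped
    # Wikipedia plots CSV keeps its 'Title' column (an assumption not shown
    # above), would just print the ten closest plots:
    # s is (row position, cosine similarity); look the movie back up in df.
    print(f"{s[1]:.3f}  {df['Title'].iloc[s[0]]}")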