def expand_word_graph(self, input_file, similarity, format='raw', window=10, pos=None): """Expands the word graph using the given document. Args: input_file (str): path to the input file. similarity (float): similarity for weighting edges. format (str): the input format, defaults to raw. window (int): the window within the sentence for connecting two words in the graph, defaults to 10. pos (set): the set of valid pos for words to be considered as nodes in the graph, defaults to (NN, NNS, NNP, NNPS, JJ, JJR, JJS). """ # define default pos tags set if pos is None: pos = set(['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS']) # initialize document loader doc = LoadFile(input_file=input_file, language=self.language) # read document doc.read_document(format=format) # flatten document and initialize nodes sequence = [] for sentence in doc.sentences: for j, node in enumerate(sentence.stems): if node not in self.graph and sentence.pos[j] in pos: self.graph.add_node(node) sequence.append((node, sentence.pos[j])) # loop through sequence to build the edges in the graph for j, node_1 in enumerate(sequence): for k in range(j + 1, min(j + window, len(sequence))): node_2 = sequence[k] if node_1[1] in pos and node_2[1] in pos \ and node_1[0] != node_2[0]: if not self.graph.has_edge(node_1[0], node_2[0]): self.graph.add_edge(node_1[0], node_2[0], weight=0) self.graph[node_1[0]][node_2[0]]['weight'] += similarity
def load_document_as_bos(input_file, format="corenlp", use_lemmas=False,
                         stemmer="porter", stoplist=None):
    """Load a document as a bag of stems.

    Args:
        input_file (str): path to input file.
        format (str): the input files format, defaults to corenlp.
        use_lemmas (bool): whether lemmas from stanford corenlp are used
            instead of stems (computed by nltk), defaults to False.
        stemmer (str): the stemmer in nltk to used (if used), defaults to
            porter.
        stoplist (iterable): the stop words for filtering tokens, defaults
            to None (no filtering).

    Returns:
        defaultdict(int): mapping from stem to its occurrence count.
    """

    # fix: the original used a mutable default argument (stoplist=[]);
    # a None sentinel avoids sharing state across calls. Converting to a
    # set also makes per-token membership tests O(1) instead of O(n).
    stoplist = set(stoplist) if stoplist is not None else set()

    # initialize load file object
    doc = LoadFile(input_file)

    # read the input file
    doc.read_document(format=format,
                      use_lemmas=use_lemmas,
                      stemmer=stemmer,
                      sep='/')

    # initialize document vector
    vector = defaultdict(int)

    # count the occurrence of each stem, skipping stop words
    for sentence in doc.sentences:
        for stem in sentence.stems:
            if stem in stoplist:
                continue
            vector[stem] += 1

    return vector
def compute_document_frequency(input_dir, output_file, format="corenlp",
                               extension="xml", use_lemmas=False,
                               stemmer="porter", stoplist=None,
                               delimiter='\t', n=3):
    """Compute n-gram document frequencies over a directory of documents.

    Each input document contributes at most once to an n-gram's document
    frequency. The gzipped output starts with a special row giving the
    total number of processed documents (--NB_DOC-- tab XX), followed by
    one row per n-gram: the n-gram, the delimiter, and its frequency.

    Args:
        input_dir (str): the input directory.
        output_file (str): the output file.
        format (str): the input files format, defaults to corenlp.
        extension (str): file extension for input documents, defaults to
            xml.
        use_lemmas (bool): whether lemmas from stanford corenlp are used
            instead of stems (computed by nltk), defaults to False.
        stemmer (str): the stemmer in nltk to used (if used), defaults to
            porter.
        stoplist (list): the stop words for filtering n-grams, default to
            None.
        delimiter (str): the delimiter between n-grams and document
            frequencies, default to tabulation.
        n (int): the length for ngrams, defaults to 3.
    """

    # n-gram -> set of documents it appears in
    df = defaultdict(set)

    # number of documents processed so far
    doc_count = 0

    for path in glob.glob(input_dir + '/*.' + extension):
        logging.info('reading file ' + path)

        # load and parse the current document
        reader = LoadFile(path)
        reader.read_document(format=format,
                             use_lemmas=use_lemmas,
                             stemmer=stemmer,
                             sep='/')

        # select candidate n-grams and drop those with punctuation marks
        reader.ngram_selection(n=n)
        reader.candidate_filtering(stoplist=stoplist)

        # register this document for every surviving candidate
        for candidate in reader.candidates:
            df[candidate].add(path)

        doc_count += 1

    # write the gzipped frequency table
    with gzip.open(output_file, 'wb') as out:

        # special first row carrying the collection size
        header = '--NB_DOC--' + delimiter + str(doc_count)
        out.write(header.encode('utf-8') + b'\n')

        for candidate, docs in df.items():
            row = candidate + delimiter + str(len(docs))
            out.write(row.encode('utf-8') + b'\n')
def compute_lda_model(input_dir, output_file, n_topics=500, format="corenlp",
                      extension="xml", use_lemmas=False, stemmer="porter",
                      language="english"):
    """Compute a LDA model from a collection of documents.

    Latent Dirichlet Allocation is computed using the sklearn module. The
    trained model (vocabulary, topic-word distributions and priors) is
    pickled into a gzipped file for later prediction.

    Args:
        input_dir (str): the input directory.
        output_file (str): the output file.
        n_topics (int): number of topics for the LDA model, defaults to
            500.
        format (str): the input files format, defaults to corenlp.
        extension (str): file extension for input documents, defaults to
            xml.
        use_lemmas (bool): whether lemmas from stanford corenlp are used
            instead of stems (computed by nltk), defaults to False.
        stemmer (str): the stemmer in nltk to used (if used), defaults to
            porter.
        language (str): the language of the documents, used for stop_words
            in sklearn CountVectorizer, defaults to 'english'.
    """

    # texts container
    texts = []

    # loop through the documents
    for input_file in glob.glob(input_dir + '/*.' + extension):
        logging.info('reading file ' + input_file)

        # initialize load file object
        doc = LoadFile(input_file)

        # read the input file
        doc.read_document(format=format,
                          use_lemmas=use_lemmas,
                          stemmer=stemmer,
                          sep='/')

        # container for current document
        text = []

        # keep the stems whose POS tag contains only uppercase letters or
        # '$' (i.e. drop punctuation-tagged tokens)
        for sentence in doc.sentences:
            text.extend([sentence.stems[i] for i in range(sentence.length)
                         if not re.search('[^A-Z$]', sentence.pos[i])])

        # add the document to the texts container
        texts.append(' '.join(text))

    # vectorize dataset
    # get the stoplist from nltk because CountVectorizer only contains
    # english stopwords atm
    tf_vectorizer = CountVectorizer(stop_words=stopwords.words(language))
    tf = tf_vectorizer.fit_transform(texts)

    # extract vocabulary
    # fix: CountVectorizer.get_feature_names() was deprecated in
    # scikit-learn 1.0 and removed in 1.2; prefer get_feature_names_out()
    # and fall back for older scikit-learn versions.
    try:
        vocabulary = tf_vectorizer.get_feature_names_out().tolist()
    except AttributeError:
        vocabulary = tf_vectorizer.get_feature_names()

    # create LDA model and train
    lda_model = LatentDirichletAllocation(n_components=n_topics,
                                          random_state=0,
                                          learning_method='batch')
    lda_model.fit(tf)

    # save all data necessary for later prediction
    # (fix: this comment was split mid-sentence in the original, which
    # would have commented out the assignment below)
    saved_model = (vocabulary,
                   lda_model.components_,
                   lda_model.exp_dirichlet_component_,
                   lda_model.doc_topic_prior_)

    # dump the model
    logging.info('writing LDA model to ' + output_file)
    with gzip.open(output_file, 'wb') as fp:
        pickle.dump(saved_model, fp)