def remove_common_words(self, words):
    """Remove a word (or list of words) considered common from every
    document in ``self.docs``.

    Stemmed variants of the given words are removed as well, so both the
    surface form and its stem disappear from the collection.

    :param words: string or list of strings to remove
    :return: None; ``self.docs`` is rebuilt in place
    """
    # Accept a bare string for caller convenience.
    if isinstance(words, str):
        words = [words]
    # Compute the stems first, then extend. The original extended
    # ``words`` with a generator that iterated ``words`` itself,
    # mutating the list while it was still being consumed.
    stemmed = [tm.stem(word) for word in words]
    words = words + [w for w in stemmed if w not in words]
    # Set membership is O(1) per token instead of O(len(words)).
    stopwords = set(words)
    # List comprehension (not ``filter``) so each doc is a concrete
    # list on both Python 2 and 3 — py3 ``filter`` is a lazy iterator.
    self.docs = [
        [word for word in doc if word not in stopwords]
        for doc in self.docs
    ]
def __init__(self, controlled_vocab_file):
    """Initialize matcher state from a controlled-vocabulary file.

    :param controlled_vocab_file: path to a text file containing one
        vocabulary phrase per line
    """
    # phrase -> placeholder list; keys preserve the raw phrases
    self.terms = {}
    with open(controlled_vocab_file, 'r') as infile:
        for line in infile:
            phrase = line.rstrip('\r\n')
            # Skip blank lines: the original ``line.splitlines()[0]``
            # raised IndexError on a trailing empty line, and an ''
            # key later crashes compute_phrase_matches.
            if phrase:
                self.terms[phrase] = []
    # stemmed unigrams drawn from every vocabulary phrase
    self.terms_flat = [stem(term)
                       for phrase in self.terms
                       for term in phrase.split()]
    # maps matrix index position to document _id
    self.doc_ids = []
    self.tdm = TermDocumentMatrix()
    # lookup table of index positions for terms on the TDM
    self.term_index_lookup = {}
    # ndarray backing structure for the TDM
    self.matrix = np.array([])
    # output dict; composite site for phrase matching
    self.out_dict = {}
    # pseudo hash-trick dict: stemmed term -> column index
    self.term_index_dict = {}
def compute_phrase_matches(self):
    """Compute phrase matches as dot products of term column vectors.

    :return: list of ``[doc_id, 'phrase1,phrase2,...']`` pairs, one per
        document that matched at least one controlled-vocabulary phrase
    """
    for search_phrase in self.terms.keys():
        terms = [stem(term) for term in search_phrase.split()]
        if len(terms) == 1:
            # Unigram: the column vector itself, skipping the header row.
            try:
                # BUG FIX: the original subscripted the builtin ``dict``
                # type (``dict[search_phrase]``) instead of
                # self.term_index_dict, so every unigram lookup raised
                # TypeError and was silently swallowed. Key on the
                # stemmed term, consistent with the multi-word branch.
                col = self.term_index_dict[terms[0]]
                self.out_dict[search_phrase] = self.matrix[:, col][1:]
            except (KeyError, TypeError):
                # phrase term absent from the matrix; best-effort skip
                pass
        else:
            # Multi-word phrase: elementwise product of the term vectors —
            # nonzero only where every term co-occurs in the document.
            try:
                dot_product = reduce(
                    operator.__mul__,
                    [self.matrix[:, self.term_index_dict[a]][1:].astype(int)
                     for a in terms],
                    1)
                self.out_dict[search_phrase] = dot_product
            except (KeyError, TypeError):
                pass
    # doc index position -> list of matched phrases
    tags = {}
    for search_phrase in self.out_dict:
        # enumerate preserves the original doc index; drop zero entries
        # (matrix cells may be ints or the string '0')
        hits = [pair for pair in enumerate(self.out_dict[search_phrase])
                if pair[1] != 0 and pair[1] != '0']
        for doc_idx, _count in hits:
            tags.setdefault(doc_idx, []).append(search_phrase)
    # index position -> document _id
    doc_lookup = dict(enumerate(self.doc_ids))
    out = {}
    for doc_idx in tags:
        out[doc_lookup[doc_idx]] = tags[doc_idx]
    # doc id and comma-joined phrase matches; .items() works on both
    # Python 2 and 3 (original .iteritems() is py2-only)
    matches = [[doc_id, ','.join(phrases)]
               for doc_id, phrases in out.items()]
    return matches