def processFile(self, file_path_and_name):
    """Read a <TEXT>-tagged document, strip its markup, and return stemmed sentences.

    Parameters
    ----------
    file_path_and_name : str
        Path of the document to parse.

    Returns
    -------
    list[Sentence]
        One Sentence per retained sentence (source path, stemmed tokens,
        original sentence text). An empty list if the file has no
        <TEXT>...</TEXT> body; a single empty-Sentence sentinel if the file
        cannot be opened (preserves the original IOError fallback).
    """
    try:
        # Context manager closes the handle — the original leaked it.
        with open(file_path_and_name, 'r') as f:
            text_0 = f.read()
    except IOError:
        print('Oops! File not found', file_path_and_name)
        return [Sentence(file_path_and_name, [], [])]

    # Extract the <TEXT>...</TEXT> body. The original called .group(0) on
    # the match unconditionally, crashing with AttributeError on documents
    # lacking the tag; return an empty result instead.
    match = re.search(r"<TEXT>.*</TEXT>", text_0, re.DOTALL)
    if match is None:
        return []
    text_1 = match.group(0)

    # Strip SGML markup and flatten to one whitespace-normalized line.
    text_1 = re.sub("<TEXT>\n", "", text_1)
    text_1 = re.sub("\n</TEXT>", "", text_1)
    text_1 = re.sub("<P>", "", text_1)
    text_1 = re.sub("</P>", "", text_1)
    text_1 = re.sub("\n", " ", text_1)
    # Normalize Penn-Treebank-style quotes to plain double quotes.
    # (The original also had re.sub("\"", "\"", ...) — a no-op, dropped.)
    text_1 = re.sub("''", "\"", text_1)
    text_1 = re.sub("``", "\"", text_1)
    text_1 = re.sub(" +", " ", text_1)
    # Remove newswire artifacts: bare underscores, AP datelines, entities.
    text_1 = re.sub(" _ ", "", text_1)
    text_1 = re.sub(r"\(AP\) _", " ", text_1)
    text_1 = re.sub(r"&\w+;", " ", text_1)  # raw string: \w was unescaped before

    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    lines = sent_tokenizer.tokenize(text_1.strip())

    # Trim the dateline prefix ("CITY -- " / "CITY _ ") off the first sentence.
    index = lines[0].find("--")
    if index != -1:
        lines[0] = lines[0][index + 2:]
    index = lines[0].find(" _ ")
    if index != -1:
        lines[0] = lines[0][index + 3:]

    # Tokens dropped after stemming (punctuation and possessive marker).
    drop_tokens = {'.', '`', ',', '_', ';', '(', ')', '?', "'", '!',
                   '"', '``', '--', ':', "''", "'s"}

    sentences = []
    for sent in lines:
        sent = sent.strip()
        OG_sent = sent[:]
        sent = sent.lower()
        line = nltk.word_tokenize(sent)
        stemmed_sentence = [porter.stem(word) for word in line]
        # Set-membership filter replaces the original 16-clause lambda;
        # tokens containing '&' are also discarded, as before.
        stemmed_sentence = [w for w in stemmed_sentence
                            if w not in drop_tokens and '&' not in w]
        # Very short sentences carry little signal for summarization — skip.
        if len(stemmed_sentence) <= 4:
            continue
        if stemmed_sentence:
            sentences.append(Sentence(file_path_and_name, stemmed_sentence, OG_sent))
    return sentences
def _read_file(src_file):
    """Parse a whitespace-delimited, CoNLL-style dependency file.

    Sentences are separated by blank lines; each token row has (at least)
    the columns: tree_id, word, lemma, _, pos, _, parent_idx, context, _, _.

    Parameters
    ----------
    src_file : str
        Path to the UTF-8 encoded source file.

    Returns
    -------
    tuple[int, dict, list]
        (total token rows read, lemma -> frequency map, list of Sentence
        objects built from (words, lemmas, preposition flags, tree edges)).
    """
    data = []
    lemma_count = {}
    total_samples = 0

    # Per-sentence accumulators, reset at each blank-line separator.
    words, lemma_words, is_prep, tree = [], [], [], []

    # Context manager closes the handle — the original leaked it.
    with open(src_file, "rt", encoding="utf-8") as src:
        for row in src:
            if row == "\n":
                # Blank line: flush the accumulated sentence.
                data.append(Sentence(words, lemma_words, is_prep, tree))
                words, lemma_words, is_prep, tree = [], [], [], []
                continue
            tree_id, word, lemma, _, pos, _, parent_idx, context, _, _ = row.split()
            total_samples += 1
            lemma_count[lemma] = lemma_count.get(lemma, 0) + 1
            words.append(word)
            lemma_words.append(lemma)
            is_prep.append(pos in PREP)  # plain bool test, was `True if ... else False`
            # parent_idx is 1-based in the file; store 0-based with its label.
            tree.append((int(parent_idx) - 1, context))

    # NOTE(review): a final sentence not followed by a blank line is dropped,
    # matching the original behavior — confirm the format guarantees a
    # trailing blank line.
    return total_samples, lemma_count, data
def buildQuery(self, sentences, TF_IDF_w, n):
    """Build a pseudo-query Sentence from the n highest-scoring TF·IDF words.

    Parameters
    ----------
    sentences : list
        Unused; kept for interface compatibility with callers.
    TF_IDF_w : dict
        Maps a TF·IDF score to the list of words carrying that score.
    n : int
        Number of query words to collect.

    Returns
    -------
    Sentence
        A Sentence labeled "query" holding the selected words.
    """
    # Walk score buckets from highest to lowest, collecting words.
    for score in sorted(TF_IDF_w.keys(), reverse=True):
        pass  # placeholder removed below
    queryWords = []
    for score in sorted(TF_IDF_w.keys(), reverse=True):
        for word in TF_IDF_w[score]:
            queryWords.append(word)
            # The original checked `if i > n: break` AFTER incrementing,
            # which could emit n+1 words; stop exactly at n.
            if len(queryWords) >= n:
                return Sentence("query", queryWords, queryWords)
    # Fewer than n words available in total: the original raised IndexError
    # on scores[j]; return what was collected instead.
    return Sentence("query", queryWords, queryWords)
def sentence2vec(self, sentence):
    """Map a raw sentence to its vector under the trained topic model.

    Parameters
    ----------
    sentence : str
        Raw input text; segmented via ``self.seg`` inside ``Sentence``.

    Returns
    -------
    The model's representation of the sentence's bag-of-words.
    """
    wrapped = Sentence(sentence, self.seg)
    bow = self.dictionary.doc2bow(wrapped.get_cuted_sentence())
    return self.model[bow]
def set_sentences(self, sentences):
    """Replace ``self.sentences`` with Sentence wrappers, one per input.

    Each raw sentence is wrapped together with the segmenter and its
    position in the input list.

    Parameters
    ----------
    sentences : list
        Raw sentences to wrap.
    """
    # enumerate + comprehension replaces the original range(len(...)) loop.
    self.sentences = [Sentence(s, self.seg, i) for i, s in enumerate(sentences)]