Code Example #1
	def processFile(self, file_path_and_name):
		try:

			# Read the raw document text and release the file handle
			f = open(file_path_and_name, 'r')
			text_0 = f.read()
			f.close()

			# Keep only the body between the <TEXT> and </TEXT> markers
			text_1 = re.search(r"<TEXT>.*</TEXT>", text_0, re.DOTALL)
			text_1 = re.sub("<TEXT>\n", "", text_1.group(0))
			text_1 = re.sub("\n</TEXT>", "", text_1)

			text_1 = re.sub("<P>", "", text_1)
			text_1 = re.sub("</P>", "", text_1)
			text_1 = re.sub("\n", " ", text_1)
			text_1 = re.sub("\"", "\"", text_1)
			text_1 = re.sub("''", "\"", text_1)
			text_1 = re.sub("``", "\"", text_1)
			text_1 = re.sub(" +", " ", text_1)
			text_1 = re.sub(" _ ", "", text_1)

			text_1 = re.sub(r"\(AP\) _", " ", text_1)
			text_1 = re.sub("&\w+;", " ", text_1)

			# Split the cleaned text into sentences with NLTK's Punkt tokenizer
			sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
			lines = sent_tokenizer.tokenize(text_1.strip())

			# Drop anything up to a leading "--" or " _ " marker in the first sentence
			index = lines[0].find("--")
			if index != -1:
				lines[0] = lines[0][index + 2:]
			index = lines[0].find(" _ ")
			if index != -1:
				lines[0] = lines[0][index + 3:]
			sentences = []

			for sent in lines:
				sent = sent.strip()
				OG_sent = sent[:]
				sent = sent.lower()
				line = nltk.word_tokenize(sent)

				# Stem every token, then discard punctuation-only tokens and possessive markers
				stemmed_sentence = [porter.stem(word) for word in line]
				stemmed_sentence = [x for x in stemmed_sentence
									if x not in {'.', '`', ',', '_', ';', '(', ')', '?', "'", '!',
												 '"', '``', '--', ':', "''", "'s"}
									and x.find('&') == -1]

				# stemmed_sentence = [word for word in stemmed_sentence if word not in stopwords.words('english')]

				# Skip very short sentences (four or fewer stemmed tokens)
				if len(stemmed_sentence) <= 4:
					continue

				if stemmed_sentence:
					sentences.append(Sentence(file_path_and_name, stemmed_sentence, OG_sent))

			return sentences


		except IOError:
			print('Oops! File not found', file_path_and_name)
			return [Sentence(file_path_and_name, [], [])]
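Example #1 assumes that re and nltk are imported at module level, that porter is a shared Porter stemmer instance, and that Sentence is a small container holding the source file name, the stemmed tokens, and the original sentence text. A minimal sketch of that assumed setup; the field names below are illustrative, not taken from the original project:

import re
import nltk
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()

class Sentence:
    def __init__(self, file_name, stemmed_tokens, original_text):
        # assumed container fields; the real project may name them differently
        self.file_name = file_name
        self.stemmed_tokens = stemmed_tokens
        self.original_text = original_text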
Code Example #2
 def _read_file(src_file):
     data = []
     lemma_count = {}
     total_samples = 0
     src = open(src_file, "rt", encoding="utf-8")
     # init lists
     words = []
     lemma_words = []
     is_prep = []
     tree = []
     for row in src:
         if row == "\n":
             # create sentence object
             data.append(Sentence(words, lemma_words, is_prep, tree))
             # init lists
             words = []
             lemma_words = []
             is_prep = []
             tree = []
             continue
         # parse one token row: ten whitespace-separated fields
         tree_id, word, lemma, _, pos, _, parent_idx, context, _, _ = row.split()
         total_samples += 1
         lemma_count[lemma] = lemma_count.get(lemma, 0) + 1
         # fill list with relevant data from the file
         words.append(word)
         lemma_words.append(lemma)
         is_prep.append(pos in PREP)
         tree.append((int(parent_idx) - 1, context))
     # close the source file before returning the collected corpus statistics
     src.close()
     return total_samples, lemma_count, data
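Example #2 relies on a module-level PREP collection of preposition POS tags and on a Sentence class that stores the word, lemma, preposition-flag, and dependency-tree lists for one sentence. A hedged sketch of how it might be defined and called; the tag set, field names, and file name are assumptions, not taken from the original project:

PREP = {"IN", "ADP"}

class Sentence:
    def __init__(self, words, lemmas, is_prep, tree):
        self.words = words
        self.lemmas = lemmas
        self.is_prep = is_prep
        self.tree = tree

total_samples, lemma_count, sentences = _read_file("train.conll")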
Code Example #3
    def buildQuery(self, sentences, TF_IDF_w, n):
        # TF_IDF_w maps a TF-IDF score to the list of words sharing that score;
        # take the n highest-scoring words as the query
        scores = list(TF_IDF_w.keys())
        scores.sort(reverse=True)

        i = 0
        j = 0
        queryWords = []

        while i < n and j < len(scores):
            words = TF_IDF_w[scores[j]]
            for word in words:
                queryWords.append(word)
                i += 1
                if i >= n:
                    break
            j += 1

        return Sentence("query", queryWords, queryWords)
Code Example #4
 def sentence2vec(self, sentence):
     # Segment the raw sentence, convert it to a bag-of-words, and project it through the trained model
     sentence = Sentence(sentence, self.seg)
     vec_bow = self.dictionary.doc2bow(sentence.get_cuted_sentence())
     return self.model[vec_bow]
Code Example #5
 def set_sentences(self, sentences):
     # Wrap each raw sentence in a Sentence object, keeping its original index
     self.sentences = []
     for i, sent in enumerate(sentences):
         self.sentences.append(Sentence(sent, self.seg, i))
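Examples #4 and #5 appear to come from the same similarity class: self.dictionary and self.model behave like a gensim corpora.Dictionary and a trained gensim model, self.seg is a word segmenter handed to Sentence, and get_cuted_sentence() returns the segmented token list. A hedged sketch of the assumed surrounding setup; the class name and training choices are illustrative only:

from gensim import corpora, models

class SimilarityModel:
    def __init__(self, corpus_tokens, seg):
        # seg is assumed to be a word segmenter shared with Sentence
        self.seg = seg
        # corpus_tokens is assumed to be a list of token lists
        self.dictionary = corpora.Dictionary(corpus_tokens)
        bow_corpus = [self.dictionary.doc2bow(tokens) for tokens in corpus_tokens]
        self.model = models.TfidfModel(bow_corpus)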