Code example #1
    # Module-level setup assumed by this method (not shown in the snippet):
    #   import re, nltk
    #   porter = nltk.stem.PorterStemmer()
    #   Sentence is the project's sentence container (sketched after this example)
    def processFile(self, file_path_and_name):
        try:
            with open(file_path_and_name, 'r') as f:
                text_0 = f.read()

            # keep only the body between <TEXT>...</TEXT>, then strip the tags
            text_1 = re.search(r"<TEXT>.*</TEXT>", text_0, re.DOTALL)
            text_1 = re.sub(r"<TEXT>\n", "", text_1.group(0))
            text_1 = re.sub(r"\n</TEXT>", "", text_1)

            # drop paragraph tags, flatten newlines, normalize quote styles
            text_1 = re.sub(r"</?P>", "", text_1)
            text_1 = re.sub(r"\n", " ", text_1)
            text_1 = re.sub(r"''", "\"", text_1)
            text_1 = re.sub(r"``", "\"", text_1)
            text_1 = re.sub(r" +", " ", text_1)
            text_1 = re.sub(r" _ ", "", text_1)

            # remove newswire artifacts such as "(AP) _" and HTML entities
            text_1 = re.sub(r"\(AP\) _", " ", text_1)
            text_1 = re.sub(r"&\w+;", " ", text_1)

            # split the cleaned text into sentences
            sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
            lines = sent_tokenizer.tokenize(text_1.strip())

            # strip a leading dateline ("CITY --", "CITY _ ") from the first sentence
            index = lines[0].find("--")
            if index != -1:
                lines[0] = lines[0][index + 2:]
            index = lines[0].find(" _ ")
            if index != -1:
                lines[0] = lines[0][index + 3:]

            sentences = []
            punct = {'.', '`', ',', '_', ';', '(', ')', '?', "'", '!', '"',
                     '``', '--', ':', "''", "'s"}

            for sent in lines:
                sent = sent.strip()
                OG_sent = sent[:]
                sent = sent.lower()
                line = nltk.word_tokenize(sent)

                # stem every token, then drop punctuation-only tokens
                stemmed_sentence = [porter.stem(word) for word in line]
                stemmed_sentence = [tok for tok in stemmed_sentence
                                    if tok not in punct and '&' not in tok]

                # stemmed_sentence = [word for word in stemmed_sentence if word not in stopwords.words('english')]

                # skip very short sentences (four tokens or fewer)
                if len(stemmed_sentence) <= 4:
                    continue

                sentences.append(Sentence(file_path_and_name, stemmed_sentence, OG_sent))

            return sentences

        except IOError:
            print('Oops! File not found:', file_path_and_name)
            return [Sentence(file_path_and_name, [], [])]
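Every example on this page constructs a project-specific Sentence container that the snippets themselves do not define. A minimal sketch of what example #1's call sites imply (source file name, stemmed tokens, original text); the field names here are guesses, not the project's actual definition:

class Sentence:
    def __init__(self, file_name, stemmed_tokens, original_text):
        self.file_name = file_name
        self.stemmed_tokens = stemmed_tokens
        self.original_text = original_text

# hypothetical call, assuming `summarizer` is an instance of the class
# that defines processFile:
# sentences = summarizer.processFile('data/sample.txt')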
Code example #2
def _read_file(src_file):
    # assumes at module level: PREP (the set of preposition POS tags) and
    # the project's Sentence class
    data = []
    lemma_count = {}
    total_samples = 0
    words, lemma_words, is_prep, tree = [], [], [], []
    with open(src_file, "rt", encoding="utf-8") as src:
        for row in src:
            if row == "\n":
                # a blank line ends the sentence: flush the collected lists
                data.append(Sentence(words, lemma_words, is_prep, tree))
                words, lemma_words, is_prep, tree = [], [], [], []
                continue
            # each row holds ten whitespace-separated columns (CoNLL-like)
            tree_id, word, lemma, _, pos, _, parent_idx, context, _, _ = row.split()
            total_samples += 1
            lemma_count[lemma] = lemma_count.get(lemma, 0) + 1
            # keep the fields the model needs
            words.append(word)
            lemma_words.append(lemma)
            is_prep.append(pos in PREP)
            tree.append((int(parent_idx) - 1, context))
    # flush the last sentence if the file does not end with a blank line
    if words:
        data.append(Sentence(words, lemma_words, is_prep, tree))
    return total_samples, lemma_count, data
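_read_file() unpacks each row into ten whitespace-separated columns, which suggests a CoNLL-style dependency file. A one-row sketch of that format with invented values, showing how the unpacking sees it:

row = "1 Dogs dog _ NOUN _ 2 nsubj _ _"
tree_id, word, lemma, _, pos, _, parent_idx, context, _, _ = row.split()
print(word, lemma, pos, int(parent_idx) - 1, context)
# -> Dogs dog NOUN 1 nsubj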
Code example #3
File: process.py  Project: mfomicheva/metric-dev
    def run_processors(self):
        # Assumes at module level: import inspect, from json import loads,
        # plus the project's processors module and Sentence class.

        results_target = []
        results_reference = []

        sentences_target = []
        sentences_reference = []

        # the config stores processor names as a JSON array (see the sketch
        # after this example)
        selected_names = loads(self.config.get('Processors', 'processors'))
        existing_processors = dict(inspect.getmembers(processors))
        selected_processors = [(name, existing_processors[name]) for name in selected_names]
        processors_with_output = []

        for name, my_class in selected_processors:

            instance = my_class()
            from_file = False
            if self.config.has_option('Processors', 'from_file'):
                if instance.__class__.__name__ in loads(self.config.get('Processors', 'from_file')):
                    from_file = True

            print('Running ' + instance.get_name())
            instance.run(self.config, from_file=from_file)

            print('Getting ' + instance.get_name())
            instance.get(self.config, from_file=from_file)

            print(instance.get_name() + ' finished!')

            # keep per-sentence results only for processors that produced output
            if instance.get_output() is not None:
                processors_with_output.append((name, my_class))
                results_target.append(instance.get_result_tgt())
                results_reference.append(instance.get_result_ref())

        # assemble one Sentence per input line, with one entry per processor
        for i in range(len(results_target[0])):

            my_sentence_tgt = Sentence()
            my_sentence_ref = Sentence()

            for k, (name, my_class) in enumerate(processors_with_output):
                instance = my_class()

                if instance.get_output() is not None:
                    my_sentence_tgt.add_data(instance.get_name(), results_target[k][i])
                    my_sentence_ref.add_data(instance.get_name(), results_reference[k][i])

            sentences_target.append(my_sentence_tgt)
            sentences_reference.append(my_sentence_ref)

        return [sentences_target, sentences_reference]
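run_processors() parses the 'processors' option with json.loads, so the config section it reads presumably stores JSON arrays, roughly as below; the processor names are hypothetical placeholders, not ones the project is known to ship:

from configparser import ConfigParser
from json import loads

config = ConfigParser()
config.read_string('''
[Processors]
processors = ["Tokenizer", "Aligner"]
from_file = ["Aligner"]
''')
print(loads(config.get('Processors', 'processors')))  # ['Tokenizer', 'Aligner']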
Code example #4
    def buildQuery(self, sentences, TF_IDF_w, n):
        # TF_IDF_w maps a TF-IDF score to the list of words with that score
        # (see the sketch after this example); collect the n top-scoring words
        scores = sorted(TF_IDF_w.keys(), reverse=True)

        i = 0
        j = 0
        queryWords = []

        while i < n:
            words = TF_IDF_w[scores[j]]
            for word in words:
                queryWords.append(word)
                i += 1
                if i >= n:  # stop once n words have been collected
                    break
            j += 1

        return Sentence("query", queryWords, queryWords)
Code example #5
    def get_clean_sentence(self, punct_cand, punct_ref, cand, ref):

        clean_cand = Sentence()
        clean_ref = Sentence()

        for method in sorted(cand.keys()):
            if method == 'alignments':
                # alignments span both sides, so one cleaning pass covers both
                alignments = self.get_clean_alignments(punct_cand, punct_ref, cand, ref)
                clean_data_cand = alignments
                clean_data_ref = alignments
            else:
                # clean each side's data against its own punctuation positions
                clean_data_cand = self.get_clean_data(cand[method], punct_cand)
                clean_data_ref = self.get_clean_data(ref[method], punct_ref)

            clean_cand.add_data(method, clean_data_cand)
            clean_ref.add_data(method, clean_data_ref)

        return clean_cand, clean_ref
Code example #6
    def sentence2vec(self, sentence):
        # segment the raw text, then convert it to a bag-of-words vector
        sentence = Sentence(sentence, self.seg)
        vec_bow = self.dictionary.doc2bow(sentence.get_cuted_sentence())
        return self.model[vec_bow]
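The doc2bow call in example #6 points to gensim, so self.dictionary and self.model are presumably a gensim Dictionary and a model such as TfidfModel. A self-contained sketch of that pairing, with a toy corpus invented for illustration:

from gensim import corpora, models

texts = [['cat', 'sat', 'mat'], ['dog', 'ate', 'bone']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
model = models.TfidfModel(corpus)

vec = model[dictionary.doc2bow(['cat', 'ate'])]
print(vec)  # sparse (token_id, weight) pairs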
Code example #7
    def set_sentences(self, sentences):
        # wrap each raw sentence in a Sentence, keeping its position index
        self.sentences = []
        for i, sent in enumerate(sentences):
            self.sentences.append(Sentence(sent, self.seg, i))