class PatternDataHandler(object):
    """Base class that matches a fixed set of patterns against sentences.

    Sentences are pulled from a data wrapper, every token window whose
    formatted form equals a known pattern is recorded on the corresponding
    ``PatternData`` entry, and the collected matches can be written to disk
    with :meth:`export_results`.

    Subclasses must implement :meth:`get_head_noun`.
    """

    __metaclass__ = ABCMeta

    def __init__(self, initial_patterns_list, input_text_file,
                 data_source_type, read_file_format='rb'):
        """Build the pattern table and the data wrapper.

        :param initial_patterns_list: iterable of raw patterns; each one is
            wrapped in a ``PatternData``.
        :param input_text_file: forwarded to ``data_wrapper_factory``.
        :param data_source_type: forwarded to ``data_wrapper_factory``.
        :param read_file_format: stored as ``self.read_format``.
        :raises ValueError: from ``min``/``max`` when no patterns are given.
        """
        self.read_format = read_file_format
        patterns_data = [
            PatternData(pattern) for pattern in initial_patterns_list
        ]
        # Manager-backed dict proxy, keyed by the formatted pattern string.
        self.pattern_to_data = Manager().dict(
            {pattern.formatted: pattern for pattern in patterns_data})
        # Pattern length is measured in whitespace-separated tokens.
        patterns_len = [
            len(pattern.split()) for pattern in self.pattern_to_data.keys()
        ]
        self.min_pattern_len = min(patterns_len)
        self.max_pattern_len = max(patterns_len)
        self.data_wrapper = data_wrapper_factory(input_text_file,
                                                 data_source_type)

    def extract_patterns_matching(self):
        """Match patterns over every sentence of the data collection.

        Prints a progress line every 10000 sentences.
        """
        sentences = self.data_wrapper.data_collection()
        for sentence_id, sentence_data in enumerate(sentences, 1):
            self.extract_pattern_in_sentence(sentence_data)
            if sentence_id % 10000 == 0:
                print("finished process sentence {}".format(sentence_id))

    def extract_patterns_from_file(self, file):
        """Match patterns over every sentence read from a single file.

        Increments the shared ``sentence_counter`` under ``sent_locker`` and
        prints a progress line every 100000 sentences.

        :param file: passed to ``data_wrapper.get_data_from_single_file``.
        """
        for sentence_data in self.data_wrapper.get_data_from_single_file(file):
            self.extract_pattern_in_sentence(sentence_data)
            with sent_locker:
                sentence_counter.value += 1
                counter = sentence_counter.value
            if counter % 100000 == 0:
                print("finished process sentence {}".format(counter))

    def extract_patterns_matching_async(self):
        """Process all ``data_wrapper.ngrams_files`` with a worker pool.

        Prints the number of workers and the total running time.
        """
        startTime = time.time()
        print("running on {} processors".format(WORKERS))
        # NOTE(review): ``initargs`` is only consumed when an ``initializer``
        # is supplied, and none is passed here -- confirm how the workers are
        # expected to obtain these shared objects.
        pool = Pool(processes=WORKERS,
                    initargs=(sent_locker, lock, sentence_counter))
        pool.map(self.extract_patterns_from_file,
                 self.data_wrapper.ngrams_files)
        pool.close()
        pool.join()
        total_time = time.time() - startTime
        print("extract_patterns_matching_async running time: {}".format(
            total_time))

    @abstractmethod
    def get_head_noun(self, after_pattern_idx, sentence, wildcard_indexes):
        """Return the head noun to append to a matched string.

        :param after_pattern_idx: index just past the matched window.
        :param sentence: the sentence being processed.
        :param wildcard_indexes: sentence indexes of the pattern wildcards.
        :return: ``NO_HEAD_NOUN`` in this base implementation.
        """
        return NO_HEAD_NOUN

    def extract_pattern_in_sentence(self, sentence):
        """Record every pattern occurrence found in ``sentence``.

        Tokens whose POS is in ``Pattern.POSSIBLE_TAGS`` are replaced by
        ``Pattern.POS_TAG``; each window of the resulting token list is then
        looked up in ``self.pattern_to_data``. For a given start index only
        the first (shortest) matching window is recorded.

        :param sentence: (SentenceData) iterable of word data exposing
            ``len``, ``words_sequence`` and ``occurrences``.
        """
        formatted_sentence = [
            Pattern.POS_TAG
            if word_data.pos in Pattern.POSSIBLE_TAGS else word_data.word
            for word_data in sentence
        ]
        for i in range(0, sentence.len - self.min_pattern_len + 1):
            # ``j`` is the exclusive end of the window and stays below
            # ``sentence.len`` -- presumably so that index ``j`` is still a
            # valid token for ``get_head_noun``.
            window_stop = min(i + self.max_pattern_len + 1, sentence.len)
            for j in range(i + self.min_pattern_len, window_stop):
                string = " ".join(formatted_sentence[i:j])
                if string not in self.pattern_to_data:
                    continue
                matched_string = " ".join(sentence.words_sequence[i:j])
                # Pattern-relative wildcard offsets -> sentence indexes.
                wildcard_indexes = [
                    i + wc_idx
                    for wc_idx in self.pattern_to_data[string].wildcards_idx
                ]
                head_noun = self.get_head_noun(j, sentence, wildcard_indexes)
                matched_string = "{} {}".format(matched_string, head_noun)
                # Read-modify-write of the shared entry: the value is fetched,
                # updated and assigned back so the proxy dict sees the change.
                # ``with`` guarantees the lock is released even if the update
                # raises, so other workers are never left blocked.
                with lock:
                    pattern_data = self.pattern_to_data[string]
                    pattern_data.add_matching_string(matched_string,
                                                     sentence.occurrences)
                    self.pattern_to_data[string] = pattern_data
                break

    def export_results(self, output_folder):
        """Write one file per pattern with its matched strings.

        The target directory is ``output_folder`` suffixed with ``_`` and the
        current local time (``time.asctime`` format); it is created when
        missing. Each file is named after the pattern's ``org_pattern``, is
        opened in append mode, and receives tab-separated rows built from
        the first two fields of each ``sorted_matching_strings`` item.

        :param output_folder: path prefix of the output directory.
        """
        import time
        localtime = time.asctime(time.localtime(time.time()))
        output_folder = output_folder + '_' + localtime
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        for _, pattern_data in self.pattern_to_data.items():
            file_name = os.path.join(output_folder, pattern_data.org_pattern)
            rows = [
                '{}\t{}'.format(x[0], x[1])
                for x in pattern_data.sorted_matching_strings
            ]
            with open(file_name, 'a') as f:
                f.write("\n".join(rows))