import os
import pickle
import sys

import pymysql

# PatternMatcher, get_words and check_tech_pairs are project-local helpers.


def test_parallel(start):
    # Load the precomputed tech-pair and synonym tables.
    with open(os.path.join(os.pardir, "data", "pairs.pkl"), 'rb') as pairs_file:
        pairs = pickle.load(pairs_file)
    with open(os.path.join(os.pardir, "data", "synonyms.pkl"), 'rb') as synonyms_file:
        synonyms = pickle.load(synonyms_file)
    pattern_matcher = PatternMatcher()
    # Credentials are redacted in the source.
    connection = pymysql.connect(host='localhost', user='******',
                                 password='******'.format(sys.argv[1]),
                                 db='stackoverflow')
    with connection.cursor() as cursor:
        # Process a batch of 100 posts starting at `start`.
        sql = ("SELECT Id, Body FROM Posts "
               "WHERE Score >= 0 AND Id >= {} AND Id < {}").format(start, start + 100)
        cursor.execute(sql)
        for i in range(cursor.rowcount):
            # post_count += 1
            current_id, row = cursor.fetchone()
            word_list = get_words(row)
            # total_sent_count += len(word_list)
            for words in word_list:
                rtn = check_tech_pairs(words)
                if rtn is not None:
                    words = rtn[0].split(" ")
                    pattern_matcher.match_pattern(words, current_id, rtn[1], "keytechs")
    connection.close()
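# Hypothetical driver (not from the original source): one way to run
# test_parallel over many posts with a process pool. The total ID range and
# pool size are assumptions; the step of 100 matches the batch size
# hard-coded in the query above.
from multiprocessing import Pool

if __name__ == '__main__':
    starts = range(0, 100000, 100)  # assumed upper bound on post IDs
    with Pool(processes=4) as pool:
        pool.map(test_parallel, starts)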
def __init__(self, articles_limit, use_dump=False, randomize=False, match_threshold=0.005,
             type_matching=True, allow_unknown_entity_types=True, print_interim_results=True,
             threads=4,
             resources_path=dir_path + '../data/mappingbased_objects_en.ttl',
             patterns_input_path=dir_path + '../data/patterns_cleaned.pkl',
             facts_output_path=dir_path + '../results/extracted_facts.nt',
             extended_facts_output_path=dir_path + '../results/extracted_facts_extended.txt'):
    super(FactExtractor, self).__init__(patterns_input_path)
    self.articles_limit = articles_limit
    self.use_dump = use_dump
    self.allow_unknown_entity_types = allow_unknown_entity_types
    self.match_threshold = match_threshold
    self.type_matching = type_matching
    self.nt_reader = NTReader(resources_path, randomize)
    self.wikipedia_connector = WikipediaConnector(self.use_dump)
    self.pattern_extractor = PatternExtractor()
    self.pattern_matcher = PatternMatcher()
    self.print_interim_results = print_interim_results
    self.discovery_resources = set()
    self.extracted_facts = []
    self.threads = threads
    self.nt_writer = NTWriter(facts_output_path)
    self.extended_facts_output_path = extended_facts_output_path
    # self._make_pattern_types_transitive()
    self._load_discovery_resources()
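# Hypothetical usage sketch (not from the original source): constructing a
# FactExtractor with a handful of the keyword arguments above. The values are
# illustrative assumptions.
extractor = FactExtractor(
    articles_limit=500,    # stop after 500 articles
    use_dump=True,         # read Wikipedia from a local dump
    randomize=True,        # shuffle the resource order
    match_threshold=0.01,  # require stronger matches than the default
    threads=8,
)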
def extract_results(labels, original_data, review_bodys):
    # Print the number of elements in each cluster.
    cluster_counts = Counter(labels)
    print(cluster_counts)
    clusters = {}
    pattern_matcher = PatternMatcher()
    # Find and print the DBSCAN result on the actual text data (review_bodys).
    for i in set(labels):
        if i != -1:  # skip the noise cluster (-1)
            clusters[i] = []
            print(i, "----")
            for x in range(len(review_bodys)):
                if labels[x] == i:
                    print(">>>", review_bodys[x])
                    sentence = get_review(review_bodys[x])
                    matches = pattern_matcher.find_matches(sentence)
                    clusters[i].append((review_bodys[x][0], sentence, matches))
            print(clusters[i], "\n")
    print(clusters)
    input()  # pause so the clusters can be inspected before extraction
    return pattern_matcher.extract_objects(clusters)
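# Hypothetical pipeline sketch (not from the original source): one way the
# labels could be produced before calling extract_results, assuming
# scikit-learn's DBSCAN over TF-IDF vectors and review_bodys rows whose second
# field is the review text. The eps/min_samples values are guesses.
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer

texts = [row[1] for row in review_bodys]
vectors = TfidfVectorizer().fit_transform(texts)
labels = DBSCAN(eps=0.5, min_samples=3).fit_predict(vectors)
objects = extract_results(labels, vectors, review_bodys)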
def with_patterns_to_pages(query, word):
    # Search with an exact-phrase matcher by wrapping the query in "".
    # word => a particle such as "で" or "に"
    pm = PatternMatcher('"' + word + query + '"')
    pages = pm.bing_search()
    for page in pages:
        page.build_keyword(word + query)
        page.pattern_word = word
        page.query = query
    # Pages whose page.keyword is '' are excluded from the returned list.
    return [page for page in pages if page.keyword]
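# Hypothetical call (not from the original source): an exact-phrase search for
# the particle "で" followed by an assumed query string, keeping only pages
# that yielded a keyword.
pages = with_patterns_to_pages('花粉症', 'で')
for page in pages:
    print(page.keyword, page.pattern_word, page.query)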
# Collect keywords from the snippets of previously fetched search-result pages.
keywords = set()
for page in pages:
    try:
        keyword = page.noun_before_query(page.snippet, constants.ACTION_WORD_IN_QUERY)
    except (ValueError, IndexError):
        continue
    if keyword:
        keywords.add(keyword)
print(keywords)

# Also include the 〜〜 of "〜〜を使う" ("use 〜〜") in the final results.
results_dic = {}
for keyword in keywords:
    results_dic[keyword] = set()
for keyword in keywords:
    pm = PatternMatcher(constants.QUERY + ' "' + constants.SO_CALLED + keyword + '"')
    keyword_pages = pm.bing_search()
    for page in keyword_pages:
        try:
            result = page.noun_before_query(page.snippet, constants.SO_CALLED + keyword)
        except (ValueError, IndexError):
            continue
        if result:
            # results_dic[keyword] grows from set() to e.g. set(['アレロック', 'アルガード'])
            results_dic[keyword].add(result)
with open(constants.PICKLE_RESULT_DICT_NAME, 'wb') as f:
    pickle.dump(results_dic, f)
pdb.set_trace()
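# Minimal sketch (not from the original source) for reloading the dictionary
# pickled above and inspecting each keyword's result set.
import pickle

import constants

with open(constants.PICKLE_RESULT_DICT_NAME, 'rb') as f:
    results_dic = pickle.load(f)
for keyword, results in results_dic.items():
    print(keyword, sorted(results))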
def search_web_pages(query):
    pm = PatternMatcher(query)
    pages = pm.bing_search()
    return pages
import sys

from pattern_matcher import PatternMatcher, PatternNode

if __name__ == '__main__':
    # Read all of stdin: a pattern count, the patterns, a path count, the paths.
    _input = [line.rstrip('\n') for line in sys.stdin]
    total_patterns = int(_input[0])
    patterns = _input[1:total_patterns + 1]
    pattern_matcher = PatternMatcher()
    for pattern in patterns:
        pattern_matcher.insert(pattern)
    total_paths = int(_input[total_patterns + 1])
    path_starts = total_patterns + 2
    path_ends = path_starts + total_paths
    paths = [path.strip('/') for path in _input[path_starts:path_ends]]
    for path in paths:
        print(pattern_matcher.find_match(path))
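# Hypothetical invocation (not from the original source) showing the stdin
# layout the driver expects: a pattern count, that many patterns, a path
# count, then the paths. The comma/wildcard pattern syntax and the script
# name are assumptions about PatternMatcher.
#
#   $ printf '2\na,*,b\n*,b,*\n2\n/a/x/b\n/w/b/z\n' | python main.py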