Code example #1
import os
import pickle
import sys

import pymysql

# PatternMatcher, get_words and check_tech_pairs are project-level helpers defined elsewhere.


def test_parallel(start):
    # Load the pre-computed pair and synonym lookups.
    with open(os.path.join(os.pardir, "data", "pairs.pkl"), 'rb') as pairs_file:
        pairs = pickle.load(pairs_file)

    with open(os.path.join(os.pardir, "data", "synonyms.pkl"), 'rb') as synonyms_file:
        synonyms = pickle.load(synonyms_file)

    pattern_matcher = PatternMatcher()
    connection = pymysql.connect(host='localhost',
                                 user='******',
                                 password='******'.format(sys.argv[1]),
                                 db='stackoverflow')
    with connection.cursor() as cursor:
        # Process a window of 100 posts starting at the given Id.
        sql = "SELECT Id, Body FROM Posts WHERE Score >= 0 AND Id >= {} AND Id < {}".format(
            start, start + 100)
        cursor.execute(sql)
        for i in range(cursor.rowcount):
            # post_count += 1
            current_id, row = cursor.fetchone()
            word_list = get_words(row)
            # total_sent_count += len(word_list)

            for words in word_list:
                rtn = check_tech_pairs(words)
                if rtn is not None:
                    words = rtn[0].split(" ")
                    pattern_matcher.match_pattern(words, current_id, rtn[1],
                                                  "keytechs")
Code example #2
    def __init__(self,
                 articles_limit,
                 use_dump=False,
                 randomize=False,
                 match_threshold=0.005,
                 type_matching=True,
                 allow_unknown_entity_types=True,
                 print_interim_results=True,
                 threads=4,
                 resources_path=dir_path +
                 '../data/mappingbased_objects_en.ttl',
                 patterns_input_path=dir_path + '../data/patterns_cleaned.pkl',
                 facts_output_path=dir_path + '../results/extracted_facts.nt',
                 extended_facts_output_path=dir_path +
                 '../results/extracted_facts_extended.txt'):
        super(FactExtractor, self).__init__(patterns_input_path)
        self.articles_limit = articles_limit
        self.use_dump = use_dump
        self.allow_unknown_entity_types = allow_unknown_entity_types
        self.match_threshold = match_threshold
        self.type_matching = type_matching
        self.nt_reader = NTReader(resources_path, randomize)
        self.wikipedia_connector = WikipediaConnector(self.use_dump)
        self.pattern_extractor = PatternExtractor()
        self.pattern_matcher = PatternMatcher()
        self.print_interim_results = print_interim_results
        self.discovery_resources = set()
        self.extracted_facts = []
        self.threads = threads
        self.nt_writer = NTWriter(facts_output_path)
        self.extended_facts_output_path = extended_facts_output_path

        # self._make_pattern_types_transitive()
        self._load_discovery_resources()
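
Only articles_limit is a required argument; everything else has a default. A hedged instantiation sketch (the values below are assumptions, not taken from the project):

extractor = FactExtractor(articles_limit=500,
                          use_dump=True,
                          randomize=True,
                          match_threshold=0.01)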
Code example #3
from collections import Counter


def extract_results(labels, original_data, review_bodys):
    # print number of elements in each cluster
    cluster_counts = Counter(labels)
    print(cluster_counts)

    clusters = {}
    pattern_matcher = PatternMatcher()
    # find and print dbscan result on actual text data - review_bodys
    for i in set(labels):
        if i != -1:  # do not print if noise (-1)
            clusters[i] = []
            print(i, "----")
            for x in range(len(review_bodys)):
                if labels[x] == i:

                    print(">>>", (review_bodys[x]))

                    sentence = get_review(review_bodys[x])
                    matches = pattern_matcher.find_matches(sentence)
                    clusters[i].append((review_bodys[x][0], sentence, matches))

                    print(clusters[i], "\n")
    print(clusters)
    input()  # pause so the printed clusters can be inspected (debugging aid)
    return pattern_matcher.extract_objects(clusters)
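
The comments indicate that labels comes from a DBSCAN run. A minimal usage sketch assuming scikit-learn, a precomputed feature matrix named vectors, and the raw review_bodys list; the parameter values are assumptions:

from sklearn.cluster import DBSCAN

labels = DBSCAN(eps=0.5, min_samples=3).fit_predict(vectors)  # eps/min_samples are assumptions
objects = extract_results(labels, vectors, review_bodys)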
Code example #4
def with_patterns_to_pages(query, word):
    # ""で厳密なマッチャで検索する。
    # word => "で", "に" など
    pm = PatternMatcher('"' + word + query + '"')
    pages = pm.bing_search()
    for page in pages:
        page.build_keyword(word + query)
        page.pattern_word = word
        page.query = query
    # Drop pages whose page.keyword is '' from the returned list.
    return [page for page in pages if page.keyword]
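
A hypothetical call, following the comment that word is a particle such as "で"; the query string is made up for illustration. The Bing query sent would be the exact phrase "で花粉症".

pages = with_patterns_to_pages('花粉症', 'で')
for page in pages:
    print(page.keyword, page.pattern_word, page.query)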
Code example #5
        try:
            keyword = page.noun_before_query(page.snippet,
                                             constants.ACTION_WORD_IN_QUERY)
        except (ValueError, IndexError):
            continue
        if keyword:
            keywords.add(keyword)
    print(keywords)

    # Also add the 〜〜 of "〜〜を使う" ("uses 〜〜") to the final results.
    results_dic = {}
    for keyword in keywords:
        results_dic[keyword] = set()

    for keyword in keywords:
        pm = PatternMatcher(constants.QUERY + ' "' + constants.SO_CALLED +
                            keyword + '"')
        keyword_pages = pm.bing_search()
        for page in keyword_pages:
            try:
                result = page.noun_before_query(page.snippet,
                                                constants.SO_CALLED + keyword)
            except (ValueError, IndexError):
                continue
            if result:
                # results_dic[keyword] grows from set() to e.g. set(['アレロック', 'アルガード'])
                results_dic[keyword].add(result)

    with open(constants.PICKLE_RESULT_DICT_NAME, 'wb') as f:
        pickle.dump(results_dic, f)

    pdb.set_trace()  # drop into the debugger so results_dic can be inspected interactively
Code example #6
def search_web_pages(query):
    # Thin wrapper: run a Bing search for the query via PatternMatcher.
    pm = PatternMatcher(query)
    pages = pm.bing_search()
    return pages
Code example #7
import sys

from pattern_matcher import PatternMatcher, PatternNode


if __name__ == '__main__':
    # Read all of stdin up front and strip trailing newlines.
    _input = [line.rstrip('\n') for line in sys.stdin]

    # First line: number of patterns, followed by the patterns themselves.
    total_patterns = int(_input[0])
    patterns = _input[1:total_patterns + 1]
    pattern_matcher = PatternMatcher()
    for pattern in patterns:
        pattern_matcher.insert(pattern)

    # Next: number of paths, followed by the paths to match.
    total_paths = int(_input[total_patterns + 1])
    path_starts = total_patterns + 2
    path_ends = path_starts + total_paths
    paths = [path.strip('/') for path in _input[path_starts:path_ends]]
    for path in paths:
        print(pattern_matcher.find_match(path))
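
For reference, the stdin layout implied by the indexing above is: a pattern count, that many pattern lines, a path count, then that many path lines. The placeholders below are assumptions; the actual pattern syntax is defined by PatternMatcher and is not shown in this snippet.

2
<pattern 1>
<pattern 2>
2
/some/path/one/
some/path/two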