ww = WebCrawler(hh.merged_result_links_list)
    ww.set_limit_on_output_sentences(SENTENCE_LIMIT)
    ww.min_of_words_in_sentence = MIN_WORD_IN_SENTENCE  # lower this to surface word-level frequencies (the sentence limit may then need to be raised)
    ww.parse_all_urls()

    RESULT_FILE = r'c:\data\results_file.txt'
    ## Dump results to text file
    with open(RESULT_FILE, 'w') as f:
        for url, desc in zip(ww.list_of_urls, ww.parse_results_list):
            f.write('\n')
            f.write('#' * 20)
            f.write('\n')
            f.write(url + '\n')
            f.write('\n')
            f.write(desc.encode('utf-8', 'ignore') + '\n' + '#' * 18 + '\n')

    ## Frequency count of noun phrases
    print 'Measuring phrase frequency'
    freq_save_filename = r'C:\data\results_file_freq.txt'

    most_common_phrases_list, phrases_freq_list = Pattern_Parsing.retrieve_top_freq_noun_phrases_fr_file(
        RESULT_FILE, 5000, 100, freq_save_filename)

    for (phrase, freq) in phrases_freq_list:
        print phrase, '  ', freq
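
The crawl-and-dump steps above repeat across these examples, so one option is to keep them in a single helper. The sketch below is an illustration only: it assumes the WebCrawler attributes used above (set_limit_on_output_sentences, min_of_words_in_sentence, parse_all_urls, list_of_urls, parse_results_list), assumes parse_results_list yields unicode text, and the name crawl_and_dump is hypothetical. Writing through io.open also removes the need for the explicit encode(..., 'ignore') call.

# Illustrative sketch only: wraps the crawl-and-dump steps above into one helper.
# Assumes WebCrawler is importable as in the surrounding project and that
# ww.parse_results_list yields unicode text; crawl_and_dump is a hypothetical name.
import io

def crawl_and_dump(links, result_file, sentence_limit, min_words):
    """Crawl every link, then write each URL and its parsed text to result_file."""
    ww = WebCrawler(links)
    ww.set_limit_on_output_sentences(sentence_limit)
    ww.min_of_words_in_sentence = min_words
    ww.parse_all_urls()

    # io.open encodes on write and drops undecodable characters, so no explicit
    # .encode(..., 'ignore') is needed on each description.
    with io.open(result_file, 'w', encoding='utf-8', errors='ignore') as f:
        for url, desc in zip(ww.list_of_urls, ww.parse_results_list):
            f.write(u'\n' + u'#' * 20 + u'\n')
            f.write(url + u'\n\n')
            f.write(desc + u'\n' + u'#' * 18 + u'\n')
    return ww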





Example #2
    hh.consolidated_results()

    print 'End Search'
    print 'Start crawling individual results'

    ww = WebCrawler(hh.merged_result_links_list)
    ww.set_limit_on_output_sentences(SENTENCE_LIMIT)
    ww.min_of_words_in_sentence = MIN_WORD_IN_SENTENCE  # lower this to surface word-level frequencies (the sentence limit may then need to be raised)
    ww.parse_all_urls()

    RESULT_FILE = r'c:\data\results_file.txt'
    ## Dump results to text file
    with open(RESULT_FILE, 'w') as f:
        for url, desc in zip(ww.list_of_urls, ww.parse_results_list):
            f.write('\n')
            f.write('#' * 20)
            f.write('\n')
            f.write(url + '\n')
            f.write('\n')
            f.write(desc.encode('utf-8', 'ignore') + '\n' + '#' * 18 + '\n')

    ## Frequency count of noun phrases
    print 'Measuring phrase frequency'
    freq_save_filename = r'C:\data\results_file_freq.txt'

    most_common_phrases_list, phrases_freq_list = Pattern_Parsing.retrieve_top_freq_noun_phrases_fr_file(
        RESULT_FILE, 5000, 100, freq_save_filename)

    for (phrase, freq) in phrases_freq_list:
        print phrase, '  ', freq
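
The comment on min_of_words_in_sentence suggests lowering the threshold when word-level rather than phrase-level frequency is of interest. A quicker cross-check, sketched below with the standard library only, is to count words directly from the dumped results file; the function name top_words_from_dump, the minimum word length, and the line-skipping rules are illustrative assumptions, not part of the original code.

# Illustrative sketch only: word-level counts read back from the dump file
# written above, using just the standard library. The skip rules and the
# length threshold are assumptions for demonstration.
import io
import re
from collections import Counter

def top_words_from_dump(result_file, top_n=100, min_len=4):
    """Count words of at least min_len characters in the crawler dump file."""
    counts = Counter()
    with io.open(result_file, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            # Skip the '#' separator rows and the URL lines written by the dump loop.
            if line.startswith(u'#') or line.startswith(u'http'):
                continue
            counts.update(w for w in re.findall(r'\w+', line.lower()) if len(w) >= min_len)
    return counts.most_common(top_n)

for word, count in top_words_from_dump(r'c:\data\results_file.txt'):
    print word, '  ', count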