Example #1
import sys
import logging
import argparse

import preprocessing  # project-local module: document preprocessing and clustering
import purity         # project-local module: purity evaluation


def main(argv):
    logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)
    
    parser = argparse.ArgumentParser(description='Cluster given HTML and plaintext documents.')
    parser.add_argument('datadir', metavar='datadir', help='directory where documents for clustering are stored')

    parser.add_argument('--usehtml', dest='html_use_tags', action='store_true', default=preprocessing.DEF_HTML_USE_TAGS,
                       help='use HTML tags for text analysis (default: false)')

    # { ------- DISTANCE FUNCTION SETUP
    dist_group = parser.add_mutually_exclusive_group(required=True)
    dist_group.add_argument('--euclidean', dest='sim_fun', action='store_const', const=preprocessing.DEF_USE_EUCLIDEAN,
                       help='use euclidean similarity')  # DEF_USE_EUCLIDEAN is assumed, by analogy with the other DEF_USE_* constants; the original reused DEF_USE_CORRELATION here

    dist_group.add_argument('--correlation', dest='sim_fun', action='store_const', const=preprocessing.DEF_USE_CORRELATION, 
                       help='use correlation similarity')
    
    dist_group.add_argument('--abscorrelation', dest='sim_fun', action='store_const', const=preprocessing.DEF_USE_ABSCORRELATION, 
                       help='use abscorrelation similarity')

    dist_group.add_argument('--unccorrelation', dest='sim_fun', action='store_const', const=preprocessing.DEF_USE_UNCCORRELATION, 
                       help='use uncentered correlation similarity')

    dist_group.add_argument('--spearman', dest='sim_fun', action='store_const', const=preprocessing.DEF_USE_SPEARMAN, 
                       help='use Spearman\'s similarity')
    
    dist_group.add_argument('--kendall', dest='sim_fun', action='store_const', const=preprocessing.DEF_USE_KENDALL, 
                       help='use Kendall\'s similarity')

    dist_group.add_argument('--manhattan', dest='sim_fun', action='store_const', const=preprocessing.DEF_USE_MANHATTAN, 
                       help='use Manhattan similarity')
    # DISTANCE FUNCTION SETUP ------- }

    # { ------- CLUSTER CENTER METHOD
    method_group = parser.add_mutually_exclusive_group(required=True)
    method_group.add_argument('--arithmetic', dest='center_method', action='store_const', const=preprocessing.DEF_USE_ARITHMETIC, 
                       help='use arithmetic mean to find cluster center')

    method_group.add_argument('--median', dest='center_method', action='store_const', const=preprocessing.DEF_USE_MEDIAN, 
                       help='use median to find cluster center')
    # CLUSTER CENTER METHOD ------- }

    parser.add_argument('--title', dest='html_title_weight', action='store', type=int, default=preprocessing.DEF_HTML_TITLE_WEIGHT, 
                       help='title weight (default: 1)')

    parser.add_argument('--h1', dest='html_h1_weight', action='store', type=int, default=preprocessing.DEF_HTML_H1_WEIGHT,
                       help='h1 weight (default: 1)')  # DEF_HTML_H1_WEIGHT is assumed; the original reused the title constant and help text

    parser.add_argument('--freq', dest='top_freq_terms', action='store', type=int, default=preprocessing.DEF_TOP_FREQ_TERMS,
                       help='number of top frequent terms for clustering (default: 2000)')
    
    parser.add_argument('--groups', dest='group_cnt', action='store', type=int, default=preprocessing.DEF_GROUP_CNT, 
                       help='number of groups (default: 3)')

    parser.add_argument('--repeats', dest='repeats', action='store', type=int, default=preprocessing.DEF_REPEATS, 
                       help='repeats in KMeans algorithm (default: 20)')

    args = parser.parse_args(argv)  # parse the argv passed to main() instead of implicitly reading sys.argv
    html_use_tags     = args.html_use_tags
    html_title_weight = args.html_title_weight
    html_h1_weight    = args.html_h1_weight
    top_freq_terms    = args.top_freq_terms
    group_cnt         = args.group_cnt
    repeats           = args.repeats
    use_simfun        = args.sim_fun
    center_method     = args.center_method

    html_conf = {'usehtml': html_use_tags, 'title': html_title_weight, 'h1': html_h1_weight}

    docs, terms = preprocessing.process_documents(args.datadir, html_conf)  # use the parsed datadir positional, not raw argv[0]
    result = preprocessing.cluster(docs, terms, top_freq_terms, group_cnt, use_simfun, repeats, center_method)

    r = sorted(result, key=lambda i: i[1])

    print "\n".join(v[0] + ", " + str(v[1]) for v in r)

    purity.purity(r, group_cnt)
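
# Entry point (assumed; the original excerpt ends with the purity call).
# A made-up invocation: python cluster.py data/docs --correlation --median
if __name__ == '__main__':
    main(sys.argv[1:])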


Example #2

# The excerpt resumes mid-function: given the value it returns and the call in
# the __main__ block below, this is presumably the tail of get_clusters().
# regexpes iterates as (c, f, i) triples, with f and i parallel sequences.
    for c, f, i in regexpes:
        print '---', c, '=', len(f)
        print '\n'.join([fi + '\t\t\t ' + str(ii) for fi, ii in zip(f, i)]), '\n'

    return mysekitei, regexpes


if __name__ == "__main__":

    if len(sys.argv) > 2:
        file_exm = sys.argv[2]
        file_gen = sys.argv[1]
    else:
        raise ValueError('expected arguments: <generated urls file> <example urls file>')
        
    with open(file_exm, 'r') as f:
        good_urls = f.readlines()

    with open(file_gen, 'r') as f:
        urls = f.readlines()

    good_urls = [ re.sub(ur'\r?\n', u'', url.lower()) for url in good_urls ]
    urls      = [ re.sub(ur'\r?\n', u'', url.lower()) for url in urls ]

    if len(sys.argv) > 3:
        mysekitei, regexpes = read_clusters(sys.argv[3])
        print purity(mysekitei, regexpes, good_urls, urls, 500, 25)
    else:
        mysekitei, regexpes = get_clusters(good_urls, urls)
        som_create_and_save(mysekitei, regexpes, good_urls, urls)
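
# A made-up invocation (generated-urls file first, then example urls; the
# optional third argument is a previously saved clusters file):
#   python sekitei_clusters.py urls.generated.txt urls.examples.txt
#   python sekitei_clusters.py urls.generated.txt urls.examples.txt data/clusters.txt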

Example #3
# Imports and the __main__ guard are assumed: the excerpt starts mid-block,
# indented like the __main__ section of Example #2. read_clusters,
# get_clusters, som_create_and_save and purity come from the surrounding
# project and are elided here.
import os
import re
import sys

if __name__ == "__main__":

    if not os.path.exists('data'):
        os.makedirs('data')

    if len(sys.argv) > 2:
        file_exm = sys.argv[2]
        file_gen = sys.argv[1]
    else:
        raise ValueError('expected arguments: <generated urls file> <example urls file>')
        
    with open(file_exm, 'r') as f:
        good_urls = f.readlines()

    with open(file_gen, 'r') as f:
        urls = f.readlines()

    good_urls = [ re.sub(ur'\r?\n', u'', url.lower()) for url in good_urls ]
    urls      = [ re.sub(ur'\r?\n', u'', url.lower()) for url in urls ]

    if len(sys.argv) > 3:
        # clusters_filename='data/clusters_freq_features.txt'
        mysekitei, regexpes = read_clusters(sys.argv[3])
        res = purity(mysekitei, regexpes, good_urls, urls, 500, 25)

        print res
        with open('data/purity.txt', 'w') as out:  # avoid shadowing the builtin 'file'
            print >>out, res
    else:
        mysekitei, regexpes = get_clusters(good_urls, urls)
        som_create_and_save(mysekitei, regexpes, good_urls, urls)
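
# For reference, these examples are Python 2 (print statements, ur'' literals,
# print >>f redirection). A sketch of the same idioms in Python 3:
#   good_urls = [re.sub(r'\r?\n', '', url.lower()) for url in good_urls]
#   with open('data/purity.txt', 'w') as out:
#       print(res, file=out)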