Example #1
def main(args):

    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    for count in range(2, len(urls) + 1):

        print '[learner] clustering with %d urls' % count

        # load data
        data = [utils.load_data(path, id) for id, url in enumerate(urls)]
        data = data[:count]

        # process data
        processor = processors.Processor(data)
        features = processor.extract()

        # clustering
        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        # score
        clusters = processor.score(labels)

        with open(os.path.join(path, 'clusters.%03d.json' % count), 'w') as f:
            f.write(
                json.dumps(clusters, indent=2,
                           ensure_ascii=False).encode('utf8'))
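
These main() functions read args.site[0], so they expect an argparse-style namespace whose site attribute is a list. A minimal sketch of such a command-line wrapper, assuming this module layout; the argument name comes from the code, the help text does not:

# Hypothetical CLI wrapper for the main() above -- a sketch, not the project's actual entry point.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='cluster extracted page data for one site')
    # main() indexes args.site[0], so 'site' is declared as a one-element positional list
    parser.add_argument('site', nargs=1, help='name of the site data directory to process')
    main(parser.parse_args())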
Example #2
def main(args):

    extractor = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                             'extractor.coffee')
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # extract data from each url
    for id, url in enumerate(urls):
        url = url.strip()
        if not url:
            continue

        # skip already extracted
        if os.path.exists(os.path.join(path, '%03d.json' % id)):
            continue

        print '[extractor] #%03d: %s' % (id, url)
        subprocess.call(
            'cd "%(path)s" && phantomjs "%(extractor)s" "%(url)s" "%(label)03d" > "%(label)03d.log" 2>&1'
            % {
                'path': path,
                'extractor': extractor,
                'url': url,
                'label': id,
            },
            shell=True)
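
The call above builds a shell string and relies on shell quoting. As a point of comparison, the same PhantomJS invocation can be expressed with an argument list, assuming phantomjs is on PATH; the log redirection moves into Python:

# Sketch: equivalent call with an argument list instead of a shell string (assumes phantomjs is on PATH).
log_path = os.path.join(path, '%03d.log' % id)
with open(log_path, 'w') as log:
    subprocess.call(['phantomjs', extractor, url, '%03d' % id],
                    cwd=path, stdout=log, stderr=subprocess.STDOUT)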
Example #3
def main(args):

    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # extract data from each url
    data = []
    for id, url in enumerate(urls):
        url = url.strip()
        if not url:
            continue

        print '[diffbot] #%03d: %s' % (id, url)
        response = urllib2.urlopen(
            'http://www.diffbot.com/api/article?' +
            urllib.urlencode({
                'url': url,
                'token': '4bc6e407da88dd8723c70a5297cdf7fb',
                'timeout': '60000',
            }))

        data.append(json.loads(response.read()))

    with open(os.path.join(path, 'diffbot.json'), 'w') as f:
        f.write(json.dumps(data, indent=2, ensure_ascii=False).encode('utf8'))
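
urllib2.urlopen has no client-side timeout by default and raises on network or HTTP errors, which would abort the whole loop. A hedged variant of the request; the 30-second timeout and the skip-on-failure policy are assumptions, not project behaviour:

# Sketch: the same Diffbot request with a client-side timeout and basic error handling.
try:
    response = urllib2.urlopen(
        'http://www.diffbot.com/api/article?' + urllib.urlencode({
            'url': url,
            'token': '4bc6e407da88dd8723c70a5297cdf7fb',
            'timeout': '60000',
        }), timeout=30)
    data.append(json.loads(response.read()))
except urllib2.URLError as e:
    print '[diffbot] #%03d failed: %s' % (id, e)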
Example #4
def main(args):

    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    for count in range(2, len(urls) + 1):

        print '[learner] clustering with %d urls' % count

        # load data
        data = [utils.load_data(path, id) for id, url in enumerate(urls)]
        data = data[:count]

        # process data
        processor = processors.Processor(data)
        features = processor.extract()

        # clustering
        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        # score
        clusters = processor.score(labels)
        
        with open(os.path.join(path, 'clusters.%03d.json' % count), 'w') as f:
            f.write(json.dumps(clusters, indent=2, ensure_ascii=False).encode('utf8'))
Example #5
def main(args):

    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # extract data from each url
    data = []
    for id, url in enumerate(urls):
        url = url.strip()
        if not url:
            continue

        print '[diffbot] #%03d: %s' % (id, url)
        response = urllib2.urlopen('http://www.diffbot.com/api/article?' + urllib.urlencode({
            'url': url,
            'token': '4bc6e407da88dd8723c70a5297cdf7fb',
            'timeout': '60000',
        }))

        data.append(json.loads(response.read()))

    with open(os.path.join(path, 'diffbot.json'), 'w') as f:
        f.write(json.dumps(data, indent=2, ensure_ascii=False).encode('utf8'))
Example #6
def main(args):

    extractor = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'extractor.coffee')
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # extract data from each url
    for id, url in enumerate(urls):
        url = url.strip()
        if not url:
            continue

        # skip already extracted
        if os.path.exists(os.path.join(path, '%03d.json' % id)):
            continue

        print '[extractor] #%03d: %s' % (id, url)
        subprocess.call('cd "%(path)s" && phantomjs "%(extractor)s" "%(url)s" "%(label)03d" > "%(label)03d.log" 2>&1' % {
            'path': path,
            'extractor': extractor,
            'url': url,
            'label': id,
        }, shell=True)
Example #7
def main(args):

    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)
    '''name=[]
    start=[]
    base=[]
    proc=[]
    urls=[]
    site_data= requests.get("http://1.7.151.12:8181/api/scraper_api/fetch_links.php").json()
    for sd in site_data:
        name ,start, base, proc=extract(sd)
        urls.append(base)'''

    for count in range(2, len(urls) + 1):

        print('[learner] clustering with %d urls' % count)

        # load data
        data = [utils.load_data(path, id) for id, url in enumerate(urls)]
        data = data[:count]

        # process data
        processor = processors.Processor(data)
        features = processor.extract()

        # clustering
        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        # score
        clusters = processor.score(labels)

        with open(os.path.join(path, 'clusters.%03d.json' % count), 'w') as f:
            f.write(
                json.dumps(clusters, indent=2,
                           ensure_ascii=False).encode('utf8'))
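
The triple-quoted block at the top of this example is a commented-out fetch of the URL list from a remote endpoint. Tidied up it would read roughly as below; extract() is not shown anywhere in this listing, so its return order is an assumption taken from the comment itself:

# Sketch of the commented-out remote fetch; extract() is an unshown helper, assumed to return (name, start, base, proc).
import requests

urls = []
site_data = requests.get("http://1.7.151.12:8181/api/scraper_api/fetch_links.php").json()
for sd in site_data:
    name, start, base, proc = extract(sd)
    urls.append(base)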
Example #8
def main(args):

    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # load data
    data = [utils.load_data(path, id) for id, url in enumerate(urls)]
    random.shuffle(data)
    for page in data:
        random.shuffle(page['texts'])

    # process data
    processor = processors.Processor(data,
                                     tokenizer=tokenizers.GenericTokenizer,
                                     analyzer=analyzers.LongestAnalyzer)
    features = processor.extract()

    # clustering
    clusterer = clusterers.DBSCAN()
    labels = clusterer.cluster(features).labels_

    # prepare features
    continuous_features, discrete_features, labels = processor.prepare(labels)

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)

    features = np.hstack([continuous_features,
                          discrete_features]).astype(np.float32)

    # scale features
    features = preprocessing.scale(features)
    print(features.shape)

    precisions = []
    recalls = []
    f1scores = []
    supports = []

    rs = cross_validation.KFold(len(labels),
                                n_folds=4,
                                shuffle=False,
                                random_state=0)
    for train_index, test_index in rs:
        print('training size = %d, testing size = %d' %
              (len(train_index), len(test_index)))

        clf = svm.SVC(verbose=False,
                      kernel='linear',
                      probability=False,
                      random_state=0,
                      cache_size=2000,
                      class_weight='auto')
        clf.fit(features[train_index], labels[train_index])

        print(clf.n_support_)

        print("training:")
        predicted = clf.predict(features[train_index])
        print(classification_report(labels[train_index], predicted))

        print("testing:")
        predicted = clf.predict(features[test_index])
        print(classification_report(labels[test_index], predicted))

        precision, recall, f1score, support = precision_recall_fscore_support(
            labels[test_index], predicted)

        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)

    for label in range(2):
        print('%f\t%f\t%f\t%f' % (precisions[label], recalls[label],
                                  f1scores[label], supports[label]))

    return

    negatives = []
    positives = []
    for i in range(len(processor.texts)):
        if labels[i]:
            positives.append(processor.texts[i])
        else:
            negatives.append(processor.texts[i])

    stats(negatives, positives)

    return
    """
Example #9
def main(args):

    extractor = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'label.py')
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # load each JSON file from chaos.
    # Read each block of that file.
    # [P2] Sort the blocks by their size.
    # Also load the gold-text of that file.
    # If matching between gold-text and that element text is
    #   above a certain threshold, label that block as 1.
    # [P2] remove the matching part from gold-text.
    # Rewrite the blocks to another json file.

    # extract data from each url

    # load data
    pages = []
    domains = collections.defaultdict(lambda: 0)

    for id, url in enumerate(urls):
        if not url.strip():
            continue
        
        host = url.split('/', 3)[2]
        #if domains[host] > 2:
        #    continue
        domains[host] += 1
        print host

        page = utils.load_data(path, id)
        processor = processors.Processor([page], tokenizer=tokenizers.GenericTokenizer, analyzer=analyzers.LongestAnalyzer)
        features = processor.extract()

        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        clusters = collections.defaultdict(list)
        for text, label in zip(processor.texts, labels):
            clusters[int(label)].append(text)

        gold_text = utils.load_gold_text(path, id)
        gold_text = processor.tokenizer.tokenize(gold_text)

        max_score = 0
        best_label = None
        for label, texts in clusters.iteritems():
            tokens = ''
            for text in texts:
                tokens += text['tokens']
            score = processor.analyzer.get_similarity(tokens, gold_text)
            if score > max_score:
                max_score = score
                best_label = label

        for text in clusters[best_label]:
            text['label'] = 1


        page_texts = []
        for label, texts in clusters.iteritems():
            page_texts += texts
        random.shuffle(page_texts)
        pages.append(page_texts)

    #random.shuffle(pages)

    continuous_features = []
    discrete_features = []
    labels = []

    for page in pages:
        for text in page:
            text_length = len(text['tokens'])
            area = text['bound']['height'] * text['bound']['width']
            text_density = float(text_length) / float(area)

            # continuous_feature
            continuous_feature = []  # [text_length, text_density]
            continuous_features.append(continuous_feature)

            # discrete features
            discrete_feature = dict()
            discrete_feature = dict(text['computed'].items())
            discrete_feature['path'] = ' > '.join(text['path'])
            """
            discrete_feature['selector'] = ' > '.join([
                '%s%s%s' % (
                    selector['name'],
                    '#' + selector['id'] if selector['id'] else '',
                    '.' + '.'.join(selector['classes']) if selector['classes'] else '',
                )
                for selector in text['selector']
            ])
            """
            discrete_feature['class'] = ' > '.join([
                '%s%s' % (
                    selector['name'],
                    '.' + '.'.join(selector['classes']) if selector['classes'] else '',
                )
                for selector in text['selector']
            ])
            """
            discrete_feature['id'] = ' > '.join([
                '%s%s' % (
                    selector['name'],
                    '#' + selector['id'] if selector['id'] else '',
                )
                for selector in text['selector']
            ])
            """
            discrete_features.append(discrete_feature)

            # label
            labels.append(text['label'])

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)

    # stack continuous and one-hot discrete features, then scale them
    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)
    features = preprocessing.scale(features)
    print features.shape

    precisions = []
    recalls = []
    f1scores = []
    supports = []

    rs = cross_validation.KFold(len(labels), n_folds=4, shuffle=False, random_state=0)
    for train_index, test_index in rs:
        print 'training size = %d, testing size = %d' % (len(train_index), len(test_index))

        clf = svm.SVC(verbose=False, kernel='linear', probability=False, random_state=0, cache_size=2000, class_weight='auto')
        clf.fit(features[train_index], labels[train_index])

        print clf.n_support_

        """
        negatives = []
        for i in clf.support_[:clf.n_support_[0]]:
            negatives.append(all_texts[i])

        positives = []
        for i in clf.support_[clf.n_support_[0]:]:
            positives.append(all_texts[i])

        stats(negatives, positives)
        """

        print "training:"
        predicted = clf.predict(features[train_index])
        print classification_report(labels[train_index], predicted)

        print "testing:"
        predicted = clf.predict(features[test_index])
        print classification_report(labels[test_index], predicted)

        precision, recall, f1score, support = precision_recall_fscore_support(labels[test_index], predicted)

        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)

    for label in range(2):
        print '%f\t%f\t%f\t%f' % (precisions[label], recalls[label], f1scores[label], supports[label])

    return
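
The host is taken with url.split('/', 3)[2] above. An equivalent using the standard library, shown only as an alternative (Python 2 module name; it lives in urllib.parse on Python 3):

# Sketch: host extraction via urlparse instead of string splitting.
from urlparse import urlparse
host = urlparse(url.strip()).netloc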
Example #10
def main(args):

    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # load data
    data = [utils.load_data(path, id) for id, url in enumerate(urls)]
    random.shuffle(data)
    for page in data:
        random.shuffle(page["texts"])

    # process data
    processor = processors.Processor(data, tokenizer=tokenizers.GenericTokenizer, analyzer=analyzers.LongestAnalyzer)
    features = processor.extract()

    # clustering
    clusterer = clusterers.DBSCAN()
    labels = clusterer.cluster(features).labels_

    # prepare features
    continuous_features, discrete_features, labels = processor.prepare(labels)

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)

    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)

    # scale features
    features = preprocessing.scale(features)
    print features.shape

    precisions = []
    recalls = []
    f1scores = []
    supports = []

    rs = cross_validation.KFold(len(labels), n_folds=4, shuffle=False, random_state=0)
    for train_index, test_index in rs:
        print "training size = %d, testing size = %d" % (len(train_index), len(test_index))

        clf = svm.SVC(
            verbose=False, kernel="linear", probability=False, random_state=0, cache_size=2000, class_weight="auto"
        )
        clf.fit(features[train_index], labels[train_index])

        print clf.n_support_

        print "training:"
        predicted = clf.predict(features[train_index])
        print classification_report(labels[train_index], predicted)

        print "testing:"
        predicted = clf.predict(features[test_index])
        print classification_report(labels[test_index], predicted)

        precision, recall, f1score, support = precision_recall_fscore_support(labels[test_index], predicted)

        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)

    for label in range(2):
        print "%f\t%f\t%f\t%f" % (precisions[label], recalls[label], f1scores[label], supports[label])

    return

    negatives = []
    positives = []
    for i in range(len(processor.texts)):
        if labels[i]:
            positives.append(processor.texts[i])
        else:
            negatives.append(processor.texts[i])

    stats(negatives, positives)

    return

    """
Example #11
def main(args):
    # path = utils.get_data_path(args.site[0])

    sites = ['safari']

    all_continuous_features = []
    all_discrete_features = []
    all_labels = []

    for site in sites:
        print('clustering %s ...' % site)

        path = utils.get_data_path(site)
        urls = utils.load_urls(path)

        # load data
        data = [utils.load_data(path, id) for id, url in enumerate(urls)]

        # process data
        processor = processors.Processor(data,
                                         tokenizer=tokenizers.GenericTokenizer,
                                         analyzer=analyzers.LongestAnalyzer)
        features = processor.extract()

        # clustering
        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        # prepare features
        continuous_features, discrete_features, labels, clusters = processor.prepare(
            labels)
        all_continuous_features += continuous_features
        all_discrete_features += discrete_features
        all_labels += labels

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(
        all_discrete_features).toarray()
    continuous_features = np.array(all_continuous_features)
    labels = np.array(all_labels).astype(np.float32)

    features = np.hstack([continuous_features,
                          discrete_features]).astype(np.float32)

    precisions = []
    recalls = []
    f1scores = []
    supports = []

    rs = KFold(4).split(labels)
    # rs = cross_validation.KFold(len(labels), n_folds=4, shuffle=False, random_state=0)
    for train_index, test_index in rs:
        # print('training size = %d, testing size = %d' % (len(train_index), len(test_index)))

        clf = svm.SVC(verbose=False,
                      kernel='linear',
                      probability=False,
                      random_state=0,
                      cache_size=2000,
                      class_weight='balanced')
        clf.fit(features[train_index], labels[train_index])

        predicted = clf.predict(features[test_index])
        print(classification_report(labels[test_index], predicted))

        precision, recall, f1score, support = precision_recall_fscore_support(
            labels[test_index], predicted)

        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)

    for label in range(2):
        print('%f\t%f\t%f\t%f' % (precisions[label], recalls[label],
                                  f1scores[label], supports[label]))

    ham = collections.defaultdict(dict)
    spam = collections.defaultdict(dict)

    pageId = 0
    for id, cluster in clusters.items():
        for page in cluster['pages'].values():
            content = ''
            for text in page['texts']:
                content += ' '.join(text['text'])

            if cluster['label'] == 1:
                ham[pageId][id] = content
            else:
                spam[pageId][id] = content
            pageId = pageId + 1

    with open(os.path.join(path, 'svm.json'), 'wb') as f:
        f.write(
            json.dumps({
                'ham': ham,
                'spam': spam
            },
                       indent=2,
                       ensure_ascii=False).encode('utf8'))

    return
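
precision_recall_fscore_support returns one value per class, which is why the code averages the collected arrays across folds and then prints precisions[label] for label in range(2). A tiny illustration; the labels below are invented:

# Illustration of the per-class return shape; the y_true/y_pred values are made up.
from sklearn.metrics import precision_recall_fscore_support
p, r, f, s = precision_recall_fscore_support([0, 0, 1, 1], [0, 1, 1, 1])
# p, r, f and s are each length-2 arrays: index 0 for class 0, index 1 for class 1.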
Example #12
def main(args):

    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)
    # print path
    count = 0
    # load data
    data = [utils.load_data(path, id) for id, url in enumerate(urls)]
    random.shuffle(data)
    for page in data:
        # print count
        # count += 1
        random.shuffle(page['texts'])

    # process data
    processor = processors.Processor(data, tokenizer=tokenizers.GenericTokenizer, analyzer=analyzers.LongestAnalyzer)
    features = processor.extract()
    #print len(features)
    # clustering
    clusterer = clusterers.DBSCAN()
    labels = clusterer.cluster(features).labels_

    # prepare features
    continuous_features, discrete_features, cluster_labels, texts1, urls, classes = processor.prepare(labels, path)

    res = [texts1, labels]
    # print len(texts1)
    lab = []
    urlss = []
    for k in texts1:
        lab.append(k.encode('ascii', 'ignore'))

    for l in urls:
        urlss.append(l.encode('ascii', 'ignore'))
    # encode the unicode text and url values to plain ASCII strings
    with open("rohit.csv", "w") as fp:
        writer = csv.writer(fp)
        for row in zip(urls, lab, labels, classes):
            writer.writerow(row)


    input("enter data ")
    
    classes = []
    with open("rohit.csv", "r") as fp:
        reader = csv.reader(fp)
        for row in reader:
            classes += [row[3]]
    # Label the dataset: the class each row belongs to is in the 4th column
    for i in xrange(1, len(classes)):
        if classes[i] == '0':  # csv.reader yields strings
            classes[i] = cluster_labels[i]
    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    discrete_features.resize(len(discrete_features), 10000) 
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)
    #print len(discrete_features[2])
    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)
    #print features
    # scale features
    features = preprocessing.scale(features)
    #preprocess the features
    rf = RandomForestClassifier(n_estimators=300)
    rf.fit(features, classes)
    # fit a random forest model on the features and the hand-labelled classes
    filename = '/home/test/nutch/runtime/local/phantomjslearning/classlibraries/ivfhaveababy.joblib.pkl'
    _ = joblib.dump(rf, filename, compress=9)
    rf = joblib.load(filename)
    # persist the fitted model to that path and load it back
    precisions = []
    recalls = []
    f1scores = []
    supports = []

    return
Example #13
def main(args):
    # path = utils.get_data_path(args.site[0])

    sites = ['theverge', 'sina', 'qq', 'techcrunch', 'usatoday', 'npr', 'prothomalo']

    all_continuous_features = []
    all_discrete_features= []
    all_labels = []

    for site in sites:
        print 'clustering %s ...' % site

        path = utils.get_data_path(site)
        urls = utils.load_urls(path)

        # load data
        data = [utils.load_data(path, id) for id, url in enumerate(urls)]

        # process data
        processor = processors.Processor(data, tokenizer=tokenizers.GenericTokenizer, analyzer=analyzers.LongestAnalyzer)
        features = processor.extract()

        # clustering
        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        # prepare features
        continuous_features, discrete_features, labels = processor.prepare(labels)
        all_continuous_features += continuous_features
        all_discrete_features += discrete_features
        all_labels += labels


    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(all_discrete_features).toarray()
    continuous_features = np.array(all_continuous_features)
    labels = np.array(all_labels).astype(np.float32)

    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)

    precisions = []
    recalls = []
    f1scores = []
    supports = []

    rs = cross_validation.KFold(len(labels), n_folds=4, shuffle=False, random_state=0)
    for train_index, test_index in rs:
        print 'training size = %d, testing size = %d' % (len(train_index), len(test_index))

        clf = svm.SVC(verbose=False, kernel='linear', probability=False, random_state=0, cache_size=2000, class_weight='auto')
        clf.fit(features[train_index], labels[train_index])

        predicted = clf.predict(features[test_index])
        print classification_report(labels[test_index], predicted)

        precision, recall, f1score, support = precision_recall_fscore_support(labels[test_index], predicted)

        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)

    for label in range(2):
        print '%f\t%f\t%f\t%f' % (precisions[label], recalls[label], f1scores[label], supports[label])

    return

    """
Example #14
def main(args):

    extractor = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                             'label.py')
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # load each JSON file from chaos.
    # Read each block of that file.
    # [P2] Sort the blocks by their size.
    # Also load the gold-text of that file.
    # If matching between gold-text and that element text is
    #   above a certain threshold, label that block as 1.
    # [P2] remove the matching part from gold-text.
    # Rewrite the blocks to another json file.

    # extract data from each url

    # load data
    pages = []
    domains = collections.defaultdict(lambda: 0)

    for id, url in enumerate(urls):
        if not url.strip():
            continue

        host = url.split('/', 3)[2]
        #if domains[host] > 2:
        #    continue
        domains[host] += 1
        print(host)

        page = utils.load_data(path, id)
        processor = processors.Processor([page],
                                         tokenizer=tokenizers.GenericTokenizer,
                                         analyzer=analyzers.LongestAnalyzer)
        features = processor.extract()

        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        clusters = collections.defaultdict(list)
        for text, label in zip(processor.texts, labels):
            clusters[int(label)].append(text)

        gold_text = utils.load_gold_text(path, id)
        gold_text = processor.tokenizer.tokenize(gold_text)

        max_score = 0
        best_label = None
        for label, texts in clusters.items():
            tokens = ''
            for text in texts:
                tokens += text['tokens']
            score = processor.analyzer.get_similarity(tokens, gold_text)
            if score > max_score:
                max_score = score
                best_label = label

        for text in clusters[best_label]:
            text['label'] = 1

        page_texts = []
        for label, texts in clusters.items():
            page_texts += texts
        random.shuffle(page_texts)
        pages.append(page_texts)

    #random.shuffle(pages)

    continuous_features = []
    discrete_features = []
    labels = []

    for page in pages:
        for text in page:
            text_length = len(text['tokens'])
            area = text['bound']['height'] * text['bound']['width']
            text_density = float(text_length) / float(area)

            # continuous_feature
            continuous_feature = []  # [text_length, text_density]
            continuous_features.append(continuous_feature)

            # discrete features
            discrete_feature = dict()
            discrete_feature = dict(text['computed'].items())
            discrete_feature['path'] = ' > '.join(text['path'])
            """
            discrete_feature['selector'] = ' > '.join([
                '%s%s%s' % (
                    selector['name'],
                    '#' + selector['id'] if selector['id'] else '',
                    '.' + '.'.join(selector['classes']) if selector['classes'] else '',
                )
                for selector in text['selector']
            ])
            """
            discrete_feature['class'] = ' > '.join([
                '%s%s' % (
                    selector['name'],
                    '.' + '.'.join(selector['classes'])
                    if selector['classes'] else '',
                ) for selector in text['selector']
            ])
            """
            discrete_feature['id'] = ' > '.join([
                '%s%s' % (
                    selector['name'],
                    '#' + selector['id'] if selector['id'] else '',
                )
                for selector in text['selector']
            ])
            """
            discrete_features.append(discrete_feature)

            # label
            labels.append(text['label'])

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)

    # stack continuous and one-hot discrete features, then scale them
    features = np.hstack([continuous_features,
                          discrete_features]).astype(np.float32)
    features = preprocessing.scale(features)
    print(features.shape)

    precisions = []
    recalls = []
    f1scores = []
    supports = []

    # rs = cross_validation.KFold(len(labels), n_folds=4, shuffle=False, random_state=0)
    rs = KFold(4).split(labels)
    for train_index, test_index in rs:
        print('training size = %d, testing size = %d' %
              (len(train_index), len(test_index)))

        clf = svm.SVC(verbose=False,
                      kernel='linear',
                      probability=False,
                      random_state=0,
                      cache_size=2000,
                      class_weight='balanced')  # 'balanced' replaces the 'auto' option removed in newer scikit-learn
        clf.fit(features[train_index], labels[train_index])

        print(clf.n_support_)
        """
        negatives = []
        for i in clf.support_[:clf.n_support_[0]]:
            negatives.append(all_texts[i])

        positives = []
        for i in clf.support_[clf.n_support_[0]:]:
            positives.append(all_texts[i])

        stats(negatives, positives)
        """

        print("training:")
        predicted = clf.predict(features[train_index])
        print(classification_report(labels[train_index], predicted))

        print("testing:")
        predicted = clf.predict(features[test_index])
        print(classification_report(labels[test_index], predicted))

        precision, recall, f1score, support = precision_recall_fscore_support(
            labels[test_index], predicted)

        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)

    for label in range(2):
        print('%f\t%f\t%f\t%f' % (precisions[label], recalls[label],
                                  f1scores[label], supports[label]))

    return
def main(args):

    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)
    # print path
    count = 0
    # load data
    data = [utils.load_data(path, id) for id, url in enumerate(urls)]
    random.shuffle(data)
    for page in data:
        # print count
        # count += 1
        random.shuffle(page['texts'])

    # process data
    processor = processors.Processor(data, tokenizer=tokenizers.GenericTokenizer, analyzer=analyzers.LongestAnalyzer)
    features = processor.extract()
    #print len(features)
    # clustering
    clusterer = clusterers.DBSCAN()
    labels = clusterer.cluster(features).labels_

    # prepare features
    continuous_features, discrete_features, cluster_labels, texts1, urls, classes = processor.prepare(labels, path)
    # prepare() returns the clustered dataset split into several lists
    lab = []
    urlss = []
    for k in texts1:
        lab.append(k.encode('ascii', 'ignore'))

    # encode the unicode text to plain ASCII strings
    
    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    # one-hot encode the discrete features (the feature-extraction step)
    discrete_features.resize(len(discrete_features), 10000)
    # resize the discrete feature array to a fixed width so the training and test sets share the same feature length
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)

    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)

    features = preprocessing.scale(features)
    # normalise the features to a common scale
    im1=Image.open("/home/test/nutch/runtime/local/phantomjslearning/data/dazedandconfused/000.png")
    im2=Image.open("/home/test/nutch/runtime/local/phantomjslearning/data/fruitsofotherhands/000.png")
    im3=Image.open("/home/test/nutch/runtime/local/phantomjslearning/data/rohitanurag/000.png")
    im4=Image.open("/home/test/nutch/runtime/local/phantomjslearning/data/thegirlwhoreadtoomuch/000.png")
    im5=Image.open("/home/test/nutch/runtime/local/phantomjslearning/data/timcotson/000.png")

    imtest=Image.open(path+"/000.png")


    result1=equal(imtest,im1)
    result2=equal(imtest,im2)
    result3=equal(imtest,im3)
    result4=equal(imtest,im4)
    result5=equal(imtest,im5)
    choose=0
    testresult=result1

    if result1 <= testresult:
        choose=1
        testresult=result1

    if result2 <= testresult:
        choose=2
        testresult=result2

    if result3 <= testresult:
        choose=3
        testresult=result3

    if result4 <= testresult:
        choose=4
        testresult=result4

    if result5 <= testresult:
        choose=5
        testresult=result5


    if choose == 1:
        usemodel = "/home/test/nutch/runtime/local/phantomjslearning/classlibraries/rfdazedandconfused.joblib.pkl"
    if choose == 2:
        usemodel = "/home/test/nutch/runtime/local/phantomjslearning/classlibraries/rffruitsofother.joblib.pkl"
    if choose == 3:
        usemodel = "/home/test/nutch/runtime/local/phantomjslearning/classlibraries/rfrohitanurag.joblib.pkl"
    if choose == 4:
        usemodel = "/home/test/nutch/runtime/local/phantomjslearning/classlibraries/rfthegirlwhoused.joblib.pkl"
    if choose == 5:
        usemodel = "/home/test/nutch/runtime/local/phantomjslearning/classlibraries/rftimscoton.joblib.pkl"
    # pick the saved model used to predict classes such as title, date, and blog paragraphs

    usemodel = "/home/test/nutch/runtime/local/phantomjslearning/classlibraries/ivfhaveababy.joblib.pkl"
    rf = joblib.load(usemodel)
    # load the model and use it for prediction
    predicted = rf.predict(features)
    print usemodel
    for i in xrange(1, len(predicted)):
        print lab[i]
        print "*********"
        print predicted[i]
        print "**********"
    return
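
The equal() helper that compares the site's screenshot against the stored reference screenshots is not shown in this listing; the way its result is minimised suggests it returns a difference score, with lower meaning more similar. A purely hypothetical sketch of such a helper using Pillow:

# Hypothetical sketch of the missing equal() helper: a smaller return value means the images are more alike.
from PIL import ImageChops

def equal(im1, im2):
    a = im1.convert('L').resize((256, 256))
    b = im2.convert('L').resize((256, 256))
    hist = ImageChops.difference(a, b).histogram()  # 256 grey-level bins
    return sum(count * level for level, count in enumerate(hist))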