Example #1
def main(args):

    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    for count in range(2, len(urls) + 1):

        print('[learner] clustering with %d urls' % count)

        # load data
        data = [utils.load_data(path, id) for id, url in enumerate(urls)]
        data = data[:count]

        # process data
        processor = processors.Processor(data)
        features = processor.extract()

        # clustering
        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        # score
        clusters = processor.score(labels)

        # binary mode: the JSON string is encoded to UTF-8 bytes below
        with open(os.path.join(path, 'clusters.%03d.json' % count), 'wb') as f:
            f.write(
                json.dumps(clusters, indent=2,
                           ensure_ascii=False).encode('utf8'))
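
A note on the clustering call: clusterers.DBSCAN() is a project-specific wrapper that is not shown on this page. Assuming it delegates to scikit-learn, the labels_ array it exposes is the standard DBSCAN output, as in this minimal sketch (the feature matrix is a stand-in for processor.extract()):

import numpy as np
from sklearn.cluster import DBSCAN

features = np.random.rand(20, 5)   # stand-in feature matrix
labels = DBSCAN(eps=0.5, min_samples=5).fit(features).labels_
# one integer cluster id per row; noise points get the label -1
print(labels)
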
Example #2
def learn(site):

    # do we need to relearn?
    count = settings.REDIS_SYNC.hlen('spider:site:%s' % site)
    learned = settings.REDIS_SYNC.get('spider:site:%s:learned' % site)
    # redis returns stored values as strings, so cast before comparing to the int count
    if learned is not None and int(learned) == count:
        return
    settings.REDIS_SYNC.set('spider:site:%s:learned' % site, count)

    print('site = %s, count = %d' % (site, count))

    # load data from redis
    data = settings.REDIS_SYNC.hgetall('spider:site:%s' % site)
    data = [pickle.loads(zlib.decompress(blob)) for blob in data.values()]

    # process data
    processor = processors.Processor(data)
    features = processor.extract()

    # clustering
    clusterer = clusterers.DBSCAN()
    labels = clusterer.cluster(features).labels_

    # score
    clusters = processor.score(labels)

    # selecting selectors
    selectors = []
    for cluster in clusters:
        if cluster['confidence'] < settings.CONFIDENCE and cluster[
                'score'] < settings.SCORE:
            continue
        for selector in cluster['selectors'].values():
            if selector and selector[-1]['name'] != 'a':
                selectors.append(selector)

    selectors = ','.join(utils.consolidate_selectors(selectors).keys())
    settings.REDIS_SYNC.set('spider:site:%s:selectors' % site, selectors)
    print('site = %s, selectors = %s' % (site, selectors))

    # for debugging purposes, write to file
    hash = hashlib.sha1(site.encode('utf8')).hexdigest()
    path = os.path.join(settings.STORE, hash[:2], hash)
    try:
        os.makedirs(path)
    except OSError:
        # directory may already exist
        pass
    path = os.path.join(path, 'clusters.json')
    with open(path, 'wb') as f:
        f.write(
            json.dumps(clusters, indent=2, ensure_ascii=False).encode('utf8'))
    print('site = %s, file = %s' % (site, path))
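
Example #2 reads each page back as a zlib-compressed pickle stored in a Redis hash. The writer side is not shown here; a minimal sketch of what it presumably looks like, using plain redis-py (the key layout matches the example, the page payload is a made-up placeholder):

import pickle
import zlib

import redis

r = redis.Redis()   # stands in for settings.REDIS_SYNC
site = 'example.com'
page = {'url': 'http://example.com/', 'texts': []}   # hypothetical page payload

# compress a pickled page into the hash; learn() reverses this with
# zlib.decompress followed by pickle.loads
r.hset('spider:site:%s' % site, page['url'], zlib.compress(pickle.dumps(page)))
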
Example #3
def main(args):

    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)
    '''name=[]
    start=[]
    base=[]
    proc=[]
    urls=[]
    site_data= requests.get("http://1.7.151.12:8181/api/scraper_api/fetch_links.php").json()
    for sd in site_data:
        name ,start, base, proc=extract(sd)
        urls.append(base)'''

    for count in range(2, len(urls) + 1):

        print('[learner] clustering with %d urls' % count)

        # load data
        data = [utils.load_data(path, id) for id, url in enumerate(urls)]
        data = data[:count]

        # process data
        processor = processors.Processor(data)
        features = processor.extract()

        # clustering
        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        # score
        clusters = processor.score(labels)

        with open(os.path.join(path, 'clusters.%03d.json' % count), 'wb') as f:
            f.write(
                json.dumps(clusters, indent=2,
                           ensure_ascii=False).encode('utf8'))
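
Example #3 repeats Example #1, apart from a disabled block that fetched the URL list from an external API. One alternative for the output step on Python 3 is to open the file in text mode with an explicit encoding and let json.dump write into it directly, as in this sketch (values are placeholders):

import json
import os

path = '.'
count = 2
clusters = [{'score': 0.9, 'confidence': 0.8}]   # stand-in for processor.score(labels)

with open(os.path.join(path, 'clusters.%03d.json' % count), 'w', encoding='utf8') as f:
    json.dump(clusters, f, indent=2, ensure_ascii=False)
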
Example #4
def main(args):

    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # load data
    data = [utils.load_data(path, id) for id, url in enumerate(urls)]
    random.shuffle(data)
    for page in data:
        random.shuffle(page['texts'])

    # process data
    processor = processors.Processor(data,
                                     tokenizer=tokenizers.GenericTokenizer,
                                     analyzer=analyzers.LongestAnalyzer)
    features = processor.extract()

    # clustering
    clusterer = clusterers.DBSCAN()
    labels = clusterer.cluster(features).labels_

    # prepare features
    continuous_features, discrete_features, labels = processor.prepare(labels)

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)

    features = np.hstack([continuous_features,
                          discrete_features]).astype(np.float32)

    # scale features
    features = preprocessing.scale(features)
    print(features.shape)

    precisions = []
    recalls = []
    f1scores = []
    supports = []

    rs = cross_validation.KFold(len(labels),
                                n_folds=4,
                                shuffle=False,
                                random_state=0)
    for train_index, test_index in rs:
        print('training size = %d, testing size = %d' %
              (len(train_index), len(test_index)))

        clf = svm.SVC(verbose=False,
                      kernel='linear',
                      probability=False,
                      random_state=0,
                      cache_size=2000,
                      class_weight='auto')
        clf.fit(features[train_index], labels[train_index])

        print(clf.n_support_)

        print("training:")
        predicted = clf.predict(features[train_index])
        print(classification_report(labels[train_index], predicted))

        print("testing:")
        predicted = clf.predict(features[test_index])
        print(classification_report(labels[test_index], predicted))

        precision, recall, f1score, support = precision_recall_fscore_support(
            labels[test_index], predicted)

        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)

    for label in range(2):
        print('%f\t%f\t%f\t%f' % (precisions[label], recalls[label],
                                  f1scores[label], supports[label]))

    return

    negatives = []
    positives = []
    for i in range(len(processor.texts)):
        if labels[i]:
            positives.append(processor.texts[i])
        else:
            negatives.append(processor.texts[i])

    stats(negatives, positives)

    return
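
Example #4 is written against the pre-0.18 scikit-learn API (sklearn.cross_validation and class_weight='auto'); Examples #5 and #7 use the sklearn.model_selection replacement. For reference, a sketch of the newer equivalent of the 4-fold split used above:

import numpy as np
from sklearn.model_selection import KFold

labels = np.zeros(100)   # stand-in label vector
kf = KFold(n_splits=4, shuffle=False)   # replaces cross_validation.KFold(len(labels), n_folds=4)
for train_index, test_index in kf.split(labels):
    print('training size = %d, testing size = %d' % (len(train_index), len(test_index)))
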
    """
Example #5
def main(args):
    # path = utils.get_data_path(args.site[0])

    sites = ['safari']

    all_continuous_features = []
    all_discrete_features = []
    all_labels = []

    for site in sites:
        print('clustering %s ...' % site)

        path = utils.get_data_path(site)
        urls = utils.load_urls(path)

        # load data
        data = [utils.load_data(path, id) for id, url in enumerate(urls)]

        # process data
        processor = processors.Processor(data,
                                         tokenizer=tokenizers.GenericTokenizer,
                                         analyzer=analyzers.LongestAnalyzer)
        features = processor.extract()

        # clustering
        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        # prepare features
        continuous_features, discrete_features, labels, clusters = processor.prepare(
            labels)
        all_continuous_features += continuous_features
        all_discrete_features += discrete_features
        all_labels += labels

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(
        all_discrete_features).toarray()
    continuous_features = np.array(all_continuous_features)
    labels = np.array(all_labels).astype(np.float32)

    features = np.hstack([continuous_features,
                          discrete_features]).astype(np.float32)

    precisions = []
    recalls = []
    f1scores = []
    supports = []

    rs = KFold(4).split(labels)
    # rs = cross_validation.KFold(len(labels), n_folds=4, shuffle=False, random_state=0)
    for train_index, test_index in rs:
        # print training size = %d, testing size = %d' % (len(train_index), len(test_index))

        clf = svm.SVC(verbose=False,
                      kernel='linear',
                      probability=False,
                      random_state=0,
                      cache_size=2000,
                      class_weight='balanced')
        clf.fit(features[train_index], labels[train_index])

        predicted = clf.predict(features[test_index])
        print(classification_report(labels[test_index], predicted))

        precision, recall, f1score, support = precision_recall_fscore_support(
            labels[test_index], predicted)

        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)

    for label in range(2):
        print('%f\t%f\t%f\t%f' % (precisions[label], recalls[label],
                                  f1scores[label], supports[label]))

    ham = collections.defaultdict(dict)
    spam = collections.defaultdict(dict)

    pageId = 0
    for id, cluster in clusters.items():
        for page in cluster['pages'].values():
            content = ''
            for text in page['texts']:
                content += ' '.join(text['text'])

            if cluster['label'] == 1:
                ham[pageId][id] = content
            else:
                spam[pageId][id] = content
            pageId += 1

    with open(os.path.join(path, 'svm.json'), 'wb') as f:
        f.write(
            json.dumps({
                'ham': ham,
                'spam': spam
            },
                       indent=2,
                       ensure_ascii=False).encode('utf8'))

    return
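
In Example #7 the discrete features are built explicitly as dicts of computed style, CSS path and class chain; Examples #4 through #6 get theirs from processor.prepare. All of them go through DictVectorizer, which one-hot encodes string values and passes numeric values through. A small sketch with made-up feature dicts:

from sklearn.feature_extraction import DictVectorizer

discrete_features = [
    {'path': 'html > body > div > p', 'font-size': 14.0},
    {'path': 'html > body > div > a', 'font-size': 12.0},
]
vectorizer = DictVectorizer()
matrix = vectorizer.fit_transform(discrete_features).toarray()
print(vectorizer.feature_names_)   # one column per (key, string value) pair, plus numeric keys
print(matrix)
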
Example #6
def main(args):
    # path = utils.get_data_path(args.site[0])

    sites = ['theverge', 'sina', 'qq', 'techcrunch', 'usatoday', 'npr', 'prothomalo']

    all_continuous_features = []
    all_discrete_features = []
    all_labels = []

    for site in sites:
        print('clustering %s ...' % site)

        path = utils.get_data_path(site)
        urls = utils.load_urls(path)

        # load data
        data = [utils.load_data(path, id) for id, url in enumerate(urls)]

        # process data
        processor = processors.Processor(data, tokenizer=tokenizers.GenericTokenizer, analyzer=analyzers.LongestAnalyzer)
        features = processor.extract()

        # clustering
        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        # prepare features
        continuous_features, discrete_features, labels = processor.prepare(labels)
        all_continuous_features += continuous_features
        all_discrete_features += discrete_features
        all_labels += labels


    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(all_discrete_features).toarray()
    continuous_features = np.array(all_continuous_features)
    labels = np.array(all_labels).astype(np.float32)

    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)

    precisions = []
    recalls = []
    f1scores = []
    supports = []

    rs = cross_validation.KFold(len(labels), n_folds=4, shuffle=False, random_state=0)
    for train_index, test_index in rs:
        print('training size = %d, testing size = %d' % (len(train_index), len(test_index)))

        clf = svm.SVC(verbose=False, kernel='linear', probability=False, random_state=0, cache_size=2000, class_weight='auto')
        clf.fit(features[train_index], labels[train_index])

        predicted = clf.predict(features[test_index])
        print(classification_report(labels[test_index], predicted))

        precision, recall, f1score, support = precision_recall_fscore_support(labels[test_index], predicted)

        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)

    for label in range(2):
        print('%f\t%f\t%f\t%f' % (precisions[label], recalls[label], f1scores[label], supports[label]))

    return
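
Examples #4 through #6 all average the per-fold metric arrays; precision_recall_fscore_support returns one entry per class for each metric, which is why the final loop indexes precisions[label] for labels 0 and 1. A toy sketch of that return shape:

import numpy as np
from sklearn.metrics import precision_recall_fscore_support

y_true = np.array([0, 0, 1, 1, 1, 0])
y_pred = np.array([0, 1, 1, 1, 0, 0])

precision, recall, f1score, support = precision_recall_fscore_support(y_true, y_pred)
for label in range(2):
    print('%f\t%f\t%f\t%f' % (precision[label], recall[label], f1score[label], support[label]))
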

    """
Example #7
def main(args):

    extractor = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                             'label.py')
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # load each JSON file from chaos.
    # Read each block of that file.
    # [P2] Sort the blocks by their size.
    # Also load the gold-text of that file.
    # If matching between gold-text and that element text is
    #   above a certain threshold, label that block as 1.
    # [P2] remove the matching part from gold-text.
    # Rewrite the blocks to another json file.

    # extract data from each url

    # load data
    pages = []
    domains = collections.defaultdict(lambda: 0)

    for id, url in enumerate(urls):
        if not url.strip():
            continue

        host = url.split('/', 3)[2]
        #if domains[host] > 2:
        #    continue
        domains[host] += 1
        print(host)

        page = utils.load_data(path, id)
        processor = processors.Processor([page],
                                         tokenizer=tokenizers.GenericTokenizer,
                                         analyzer=analyzers.LongestAnalyzer)
        features = processor.extract()

        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        clusters = collections.defaultdict(list)
        for text, label in zip(processor.texts, labels):
            clusters[int(label)].append(text)

        gold_text = utils.load_gold_text(path, id)
        gold_text = processor.tokenizer.tokenize(gold_text)

        max_score = 0
        best_label = None
        for label, texts in clusters.items():
            tokens = ''
            for text in texts:
                tokens += text['tokens']
            score = processor.analyzer.get_similarity(tokens, gold_text)
            if score > max_score:
                max_score = score
                best_label = label

        for text in clusters[best_label]:
            text['label'] = 1

        page_texts = []
        for label, texts in clusters.items():
            page_texts += texts
        random.shuffle(page_texts)
        pages.append(page_texts)

    #random.shuffle(pages)

    continuous_features = []
    discrete_features = []
    labels = []

    for page in pages:
        for text in page:
            text_length = len(text['tokens'])
            area = text['bound']['height'] * text['bound']['width']
            text_density = float(text_length) / float(area)

            # continuous_feature
            continuous_feature = []  # [text_length, text_density] currently disabled
            continuous_features.append(continuous_feature)

            # discrete features
            discrete_feature = dict(text['computed'].items())
            discrete_feature['path'] = ' > '.join(text['path'])
            """
            discrete_feature['selector'] = ' > '.join([
                '%s%s%s' % (
                    selector['name'],
                    '#' + selector['id'] if selector['id'] else '',
                    '.' + '.'.join(selector['classes']) if selector['classes'] else '',
                )
                for selector in text['selector']
            ])
            """
            discrete_feature['class'] = ' > '.join([
                '%s%s' % (
                    selector['name'],
                    '.' + '.'.join(selector['classes'])
                    if selector['classes'] else '',
                ) for selector in text['selector']
            ])
            """
            discrete_feature['id'] = ' > '.join([
                '%s%s' % (
                    selector['name'],
                    '#' + selector['id'] if selector['id'] else '',
                )
                for selector in text['selector']
            ])
            """
            discrete_features.append(discrete_feature)

            # label
            labels.append(text['label'])

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)

    features = np.hstack([continuous_features,
                          discrete_features]).astype(np.float32)

    # scale features after stacking the continuous and discrete parts;
    # scaling earlier would operate on the stale per-page matrix from the loop
    features = preprocessing.scale(features)
    print(features.shape)

    precisions = []
    recalls = []
    f1scores = []
    supports = []

    # rs = cross_validation.KFold(len(labels), n_folds=4, shuffle=False, random_state=0)
    rs = KFold(4).split(labels)
    for train_index, test_index in rs:
        print('training size = %d, testing size = %d' %
              (len(train_index), len(test_index)))

        clf = svm.SVC(verbose=False,
                      kernel='linear',
                      probability=False,
                      random_state=0,
                      cache_size=2000,
                      class_weight='balanced')  # 'auto' is not accepted by newer scikit-learn
        clf.fit(features[train_index], labels[train_index])

        print(clf.n_support_)
        """
        negatives = []
        for i in clf.support_[:clf.n_support_[0]]:
            negatives.append(all_texts[i])

        positives = []
        for i in clf.support_[clf.n_support_[0]:]:
            positives.append(all_texts[i])

        stats(negatives, positives)
        """

        print("training:")
        predicted = clf.predict(features[train_index])
        print(classification_report(labels[train_index], predicted))

        print("testing:")
        predicted = clf.predict(features[test_index])
        print(classification_report(labels[test_index], predicted))

        precision, recall, f1score, support = precision_recall_fscore_support(
            labels[test_index], predicted)

        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)

    for label in range(2):
        print('%f\t%f\t%f\t%f' % (precisions[label], recalls[label],
                                  f1scores[label], supports[label]))

    return
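
The cluster-selection step in Example #7 relies on processor.analyzer.get_similarity, which is project code not shown on this page. As an illustration only, here is the same "keep the cluster whose concatenated tokens best match the gold text" selection with difflib's ratio as a stand-in similarity measure (an assumption, not the project's LongestAnalyzer):

import collections
import difflib

def best_matching_cluster(clusters, gold_text):
    # clusters: {label: [text dicts with a 'tokens' string]}, as built in Example #7
    max_score, best_label = 0.0, None
    for label, texts in clusters.items():
        tokens = ''.join(text['tokens'] for text in texts)
        score = difflib.SequenceMatcher(None, tokens, gold_text).ratio()
        if score > max_score:
            max_score, best_label = score, label
    return best_label

clusters = collections.defaultdict(list)
clusters[0].append({'tokens': 'breaking news article body text'})
clusters[1].append({'tokens': 'home about contact login'})
print(best_matching_cluster(clusters, 'news article body'))
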