import collections
import hashlib
import json
import os
import pickle
import random
import zlib

import numpy as np
from sklearn import preprocessing, svm
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.model_selection import KFold

# Project-local modules; the exact import paths may differ in the package layout.
import analyzers
import clusterers
import processors
import settings
import tokenizers
import utils

# NOTE: this module accumulates several experimental main() variants; when the
# module is executed, the last definition of main() is the one in effect.


def main(args):
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # cluster with a growing prefix of the urls and write one result per count
    for count in range(2, len(urls) + 1):
        print('[learner] clustering with %d urls' % count)

        # load data
        data = [utils.load_data(path, id) for id, url in enumerate(urls)]
        data = data[:count]

        # process data
        processor = processors.Processor(data)
        features = processor.extract()

        # clustering
        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        # score
        clusters = processor.score(labels)
        with open(os.path.join(path, 'clusters.%03d.json' % count), 'wb') as f:
            f.write(json.dumps(clusters, indent=2, ensure_ascii=False).encode('utf8'))
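# Hypothetical helper, not part of the original module: read back the
# clusters.%03d.json files that main() writes above and report how the best
# cluster score evolves as more urls are added. This assumes each file holds
# the list of cluster dicts returned by processor.score(), each carrying a
# 'score' field, as learn() below relies on.
def inspect_scores(path, max_count):
    for count in range(2, max_count + 1):
        with open(os.path.join(path, 'clusters.%03d.json' % count), 'rb') as f:
            clusters = json.loads(f.read().decode('utf8'))
        best = max(cluster['score'] for cluster in clusters)
        print('[learner] %d urls -> best cluster score = %f' % (count, best))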
def learn(site):
    # do we need to relearn? (redis returns the counter as a string/bytes)
    count = settings.REDIS_SYNC.hlen('spider:site:%s' % site)
    learned = settings.REDIS_SYNC.get('spider:site:%s:learned' % site)
    if learned is not None and int(learned) == count:
        return
    settings.REDIS_SYNC.set('spider:site:%s:learned' % site, count)
    print('site = %s, count = %d' % (site, count))

    # load data from redis: each hash field holds a zlib-compressed pickled page
    data = settings.REDIS_SYNC.hgetall('spider:site:%s' % site)
    data = [pickle.loads(zlib.decompress(page)) for page in data.values()]

    # process data
    processor = processors.Processor(data)
    features = processor.extract()

    # clustering
    clusterer = clusterers.DBSCAN()
    labels = clusterer.cluster(features).labels_

    # score
    clusters = processor.score(labels)

    # select selectors from clusters that pass the confidence/score thresholds
    selectors = []
    for cluster in clusters:
        if cluster['confidence'] < settings.CONFIDENCE and cluster['score'] < settings.SCORE:
            continue
        for selector in cluster['selectors'].values():
            # skip selectors that terminate at anchor tags
            if selector and selector[-1]['name'] != 'a':
                selectors.append(selector)
    selectors = ','.join(utils.consolidate_selectors(selectors).keys())
    settings.REDIS_SYNC.set('spider:site:%s:selectors' % site, selectors)
    print('site = %s, selectors = %s' % (site, selectors))

    # for debugging purposes, write the clusters to a file
    hash = hashlib.sha1(site.encode('utf8')).hexdigest()
    path = os.path.join(settings.STORE, hash[:2], hash)
    try:
        os.makedirs(path)
    except OSError:
        pass
    path = os.path.join(path, 'clusters.json')
    with open(path, 'wb') as f:
        f.write(json.dumps(clusters, indent=2, ensure_ascii=False).encode('utf8'))
    print('site = %s, file = %s' % (site, path))
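# For context, a minimal sketch (assuming redis-py and the key layout used by
# learn() above) of how a crawler might populate the hash that learn() reads:
# each field of spider:site:<site> holds one zlib-compressed pickled page.
# store_page() and its arguments are illustrative, not part of the project.
def store_page(site, url, page):
    settings.REDIS_SYNC.hset('spider:site:%s' % site, url,
                             zlib.compress(pickle.dumps(page)))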
# variant: same per-count clustering loop, with an optional remote site list
def main(args):
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # alternative (disabled): fetch the site list from a remote API instead
    # name, start, base, proc, urls = [], [], [], [], []
    # site_data = requests.get('http://1.7.151.12:8181/api/scraper_api/fetch_links.php').json()
    # for sd in site_data:
    #     name, start, base, proc = extract(sd)
    #     urls.append(base)

    for count in range(2, len(urls) + 1):
        print('[learner] clustering with %d urls' % count)

        # load data
        data = [utils.load_data(path, id) for id, url in enumerate(urls)]
        data = data[:count]

        # process data
        processor = processors.Processor(data)
        features = processor.extract()

        # clustering
        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        # score
        clusters = processor.score(labels)
        with open(os.path.join(path, 'clusters.%03d.json' % count), 'wb') as f:
            f.write(json.dumps(clusters, indent=2, ensure_ascii=False).encode('utf8'))
# variant: single-site SVM evaluation with shuffled pages and 4-fold cross-validation
def main(args):
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # load data
    data = [utils.load_data(path, id) for id, url in enumerate(urls)]
    random.shuffle(data)
    for page in data:
        random.shuffle(page['texts'])

    # process data
    processor = processors.Processor(data, tokenizer=tokenizers.GenericTokenizer,
                                     analyzer=analyzers.LongestAnalyzer)
    features = processor.extract()

    # clustering
    clusterer = clusterers.DBSCAN()
    labels = clusterer.cluster(features).labels_

    # prepare features
    continuous_features, discrete_features, labels = processor.prepare(labels)

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)

    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)

    # scale features
    features = preprocessing.scale(features)
    print(features.shape)

    precisions = []
    recalls = []
    f1scores = []
    supports = []
    rs = KFold(4).split(labels)
    for train_index, test_index in rs:
        print('training size = %d, testing size = %d' % (len(train_index), len(test_index)))

        clf = svm.SVC(verbose=False, kernel='linear', probability=False,
                      random_state=0, cache_size=2000, class_weight='balanced')
        clf.fit(features[train_index], labels[train_index])
        print(clf.n_support_)

        print("training:")
        predicted = clf.predict(features[train_index])
        print(classification_report(labels[train_index], predicted))

        print("testing:")
        predicted = clf.predict(features[test_index])
        print(classification_report(labels[test_index], predicted))

        precision, recall, f1score, support = precision_recall_fscore_support(
            labels[test_index], predicted)
        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)
    for label in range(2):
        print('%f\t%f\t%f\t%f' % (precisions[label], recalls[label],
                                  f1scores[label], supports[label]))
    return

    # unreachable below the early return; kept for reference
    negatives = []
    positives = []
    for i in range(len(processor.texts)):
        if labels[i]:
            positives.append(processor.texts[i])
        else:
            negatives.append(processor.texts[i])
    stats(negatives, positives)  # stats() is defined elsewhere in the project
    return
# variant: multi-site SVM evaluation that also dumps ham/spam cluster contents
def main(args):
    # path = utils.get_data_path(args.site[0])
    sites = ['safari']
    all_continuous_features = []
    all_discrete_features = []
    all_labels = []
    for site in sites:
        print('clustering %s ...' % site)
        path = utils.get_data_path(site)
        urls = utils.load_urls(path)

        # load data
        data = [utils.load_data(path, id) for id, url in enumerate(urls)]

        # process data
        processor = processors.Processor(data, tokenizer=tokenizers.GenericTokenizer,
                                         analyzer=analyzers.LongestAnalyzer)
        features = processor.extract()

        # clustering
        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        # prepare features
        continuous_features, discrete_features, labels, clusters = processor.prepare(
            labels)
        all_continuous_features += continuous_features
        all_discrete_features += discrete_features
        all_labels += labels

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(all_discrete_features).toarray()
    continuous_features = np.array(all_continuous_features)
    labels = np.array(all_labels).astype(np.float32)

    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)

    precisions = []
    recalls = []
    f1scores = []
    supports = []
    rs = KFold(4).split(labels)
    for train_index, test_index in rs:
        clf = svm.SVC(verbose=False, kernel='linear', probability=False,
                      random_state=0, cache_size=2000, class_weight='balanced')
        clf.fit(features[train_index], labels[train_index])

        predicted = clf.predict(features[test_index])
        print(classification_report(labels[test_index], predicted))

        precision, recall, f1score, support = precision_recall_fscore_support(
            labels[test_index], predicted)
        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)
    for label in range(2):
        print('%f\t%f\t%f\t%f' % (precisions[label], recalls[label],
                                  f1scores[label], supports[label]))

    # split cluster contents into ham/spam by cluster label for inspection;
    # clusters and path refer to the last processed site
    ham = collections.defaultdict(dict)
    spam = collections.defaultdict(dict)
    pageId = 0
    for id, cluster in clusters.items():
        for page in cluster['pages'].values():
            content = ''
            for text in page['texts']:
                content += ' '.join(text['text'])
            if cluster['label'] == 1:
                ham[pageId][id] = content
            else:
                spam[pageId][id] = content
            pageId = pageId + 1
    with open(os.path.join(path, 'svm.json'), 'wb') as f:
        f.write(json.dumps({'ham': ham, 'spam': spam}, indent=2,
                           ensure_ascii=False).encode('utf8'))
    return
# variant: multi-site SVM evaluation across several news sites
def main(args):
    # path = utils.get_data_path(args.site[0])
    sites = ['theverge', 'sina', 'qq', 'techcrunch', 'usatoday', 'npr', 'prothomalo']
    all_continuous_features = []
    all_discrete_features = []
    all_labels = []
    for site in sites:
        print('clustering %s ...' % site)
        path = utils.get_data_path(site)
        urls = utils.load_urls(path)

        # load data
        data = [utils.load_data(path, id) for id, url in enumerate(urls)]

        # process data
        processor = processors.Processor(data, tokenizer=tokenizers.GenericTokenizer,
                                         analyzer=analyzers.LongestAnalyzer)
        features = processor.extract()

        # clustering
        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        # prepare features
        continuous_features, discrete_features, labels = processor.prepare(labels)
        all_continuous_features += continuous_features
        all_discrete_features += discrete_features
        all_labels += labels

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(all_discrete_features).toarray()
    continuous_features = np.array(all_continuous_features)
    labels = np.array(all_labels).astype(np.float32)

    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)

    precisions = []
    recalls = []
    f1scores = []
    supports = []
    rs = KFold(4).split(labels)
    for train_index, test_index in rs:
        print('training size = %d, testing size = %d' % (len(train_index), len(test_index)))

        clf = svm.SVC(verbose=False, kernel='linear', probability=False,
                      random_state=0, cache_size=2000, class_weight='balanced')
        clf.fit(features[train_index], labels[train_index])

        predicted = clf.predict(features[test_index])
        print(classification_report(labels[test_index], predicted))

        precision, recall, f1score, support = precision_recall_fscore_support(
            labels[test_index], predicted)
        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)
    for label in range(2):
        print('%f\t%f\t%f\t%f' % (precisions[label], recalls[label],
                                  f1scores[label], supports[label]))
    return
# variant: gold-text supervised labeling followed by SVM cross-validation
def main(args):
    extractor = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'label.py')
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # Load each JSON file from chaos and read each block of that file.
    # [P2] Sort the blocks by their size.
    # Also load the gold text of that file. If the match between the gold
    # text and a block's text is above a certain threshold, label that
    # block as 1.
    # [P2] Remove the matching part from the gold text.
    # Rewrite the blocks to another JSON file.

    # extract data from each url and load it
    pages = []
    domains = collections.defaultdict(lambda: 0)
    for id, url in enumerate(urls):
        if not url.strip():
            continue
        host = url.split('/', 3)[2]
        # if domains[host] > 2:
        #     continue
        domains[host] += 1
        print(host)

        page = utils.load_data(path, id)
        processor = processors.Processor([page], tokenizer=tokenizers.GenericTokenizer,
                                         analyzer=analyzers.LongestAnalyzer)
        features = processor.extract()

        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        clusters = collections.defaultdict(list)
        for text, label in zip(processor.texts, labels):
            clusters[int(label)].append(text)

        # find the cluster whose tokens best match the gold text
        gold_text = utils.load_gold_text(path, id)
        gold_text = processor.tokenizer.tokenize(gold_text)
        max_score = 0
        best_label = None
        for label, texts in clusters.items():
            tokens = ''
            for text in texts:
                tokens += text['tokens']
            score = processor.analyzer.get_similarity(tokens, gold_text)
            if score > max_score:
                max_score = score
                best_label = label

        # label the best-matching cluster as positive
        for text in clusters[best_label]:
            text['label'] = 1

        page_texts = []
        for label, texts in clusters.items():
            page_texts += texts
        random.shuffle(page_texts)
        pages.append(page_texts)

    # random.shuffle(pages)

    continuous_features = []
    discrete_features = []
    labels = []
    for page in pages:
        for text in page:
            text_length = len(text['tokens'])
            area = text['bound']['height'] * text['bound']['width']
            text_density = float(text_length) / float(area)

            # continuous features (currently disabled)
            continuous_feature = []  # [text_length, text_density]
            continuous_features.append(continuous_feature)

            # discrete features
            discrete_feature = dict(text['computed'].items())
            discrete_feature['path'] = ' > '.join(text['path'])
            """
            discrete_feature['selector'] = ' > '.join([
                '%s%s%s' % (
                    selector['name'],
                    '#' + selector['id'] if selector['id'] else '',
                    '.' + '.'.join(selector['classes']) if selector['classes'] else '',
                )
                for selector in text['selector']
            ])
            """
            discrete_feature['class'] = ' > '.join([
                '%s%s' % (
                    selector['name'],
                    '.' + '.'.join(selector['classes']) if selector['classes'] else '',
                )
                for selector in text['selector']
            ])
            """
            discrete_feature['id'] = ' > '.join([
                '%s%s' % (
                    selector['name'],
                    '#' + selector['id'] if selector['id'] else '',
                )
                for selector in text['selector']
            ])
            """
            discrete_features.append(discrete_feature)

            # label (texts outside the best-matching cluster carry no label key)
            labels.append(text.get('label', 0))

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)

    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)

    # scale features (stacking must happen before scaling)
    features = preprocessing.scale(features)
    print(features.shape)

    precisions = []
    recalls = []
    f1scores = []
    supports = []
    rs = KFold(4).split(labels)
    for train_index, test_index in rs:
        print('training size = %d, testing size = %d' % (len(train_index), len(test_index)))

        clf = svm.SVC(verbose=False, kernel='linear', probability=False,
                      random_state=0, cache_size=2000, class_weight='balanced')
        clf.fit(features[train_index], labels[train_index])
        print(clf.n_support_)
        """
        negatives = []
        for i in clf.support_[:clf.n_support_[0]]:
            negatives.append(all_texts[i])
        positives = []
        for i in clf.support_[clf.n_support_[0]:]:
            positives.append(all_texts[i])
        stats(negatives, positives)
        """

        print("training:")
        predicted = clf.predict(features[train_index])
        print(classification_report(labels[train_index], predicted))

        print("testing:")
        predicted = clf.predict(features[test_index])
        print(classification_report(labels[test_index], predicted))

        precision, recall, f1score, support = precision_recall_fscore_support(
            labels[test_index], predicted)
        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)
    for label in range(2):
        print('%f\t%f\t%f\t%f' % (precisions[label], recalls[label],
                                  f1scores[label], supports[label]))
    return
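# A possible command-line entry point; the project's real launcher may differ.
# Every main() variant above indexes args.site[0], so nargs=1 mirrors that.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='cluster pages and train the learner')
    parser.add_argument('site', nargs=1, help='site name under the data directory')
    main(parser.parse_args())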