def main(args):
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    for count in range(2, len(urls) + 1):
        print '[learner] clustering with %d urls' % count

        # load data
        data = [utils.load_data(path, id) for id, url in enumerate(urls)]
        data = data[:count]

        # process data
        processor = processors.Processor(data)
        features = processor.extract()

        # clustering
        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        # score
        clusters = processor.score(labels)

        with open(os.path.join(path, 'clusters.%03d.json' % count), 'w') as f:
            f.write(json.dumps(clusters, indent=2, ensure_ascii=False).encode('utf8'))
def main(args):
    extractor = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'extractor.coffee')
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # extract data from each url
    for id, url in enumerate(urls):
        url = url.strip()
        if not url:
            continue

        # skip already extracted
        if os.path.exists(os.path.join(path, '%03d.json' % id)):
            continue

        print '[extractor] #%03d: %s' % (id, url)
        subprocess.call(
            'cd "%(path)s" && phantomjs "%(extractor)s" "%(url)s" "%(label)03d" > "%(label)03d.log" 2>&1' % {
                'path': path,
                'extractor': extractor,
                'url': url,
                'label': id,
            }, shell=True)
def main(args):
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # extract data from each url
    data = []
    for id, url in enumerate(urls):
        url = url.strip()
        if not url:
            continue

        print '[diffbot] #%03d: %s' % (id, url)
        response = urllib2.urlopen(
            'http://www.diffbot.com/api/article?' + urllib.urlencode({
                'url': url,
                'token': '4bc6e407da88dd8723c70a5297cdf7fb',
                'timeout': '60000',
            }))
        data.append(json.loads(response.read()))

    with open(os.path.join(path, 'diffbot.json'), 'w') as f:
        f.write(json.dumps(data, indent=2, ensure_ascii=False).encode('utf8'))
def main(args):
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    '''
    name = []
    start = []
    base = []
    proc = []
    urls = []
    site_data = requests.get("http://1.7.151.12:8181/api/scraper_api/fetch_links.php").json()
    for sd in site_data:
        name, start, base, proc = extract(sd)
        urls.append(base)
    '''

    for count in range(2, len(urls) + 1):
        print('[learner] clustering with %d urls' % count)

        # load data
        data = [utils.load_data(path, id) for id, url in enumerate(urls)]
        data = data[:count]

        # process data
        processor = processors.Processor(data)
        features = processor.extract()

        # clustering
        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        # score
        clusters = processor.score(labels)

        with open(os.path.join(path, 'clusters.%03d.json' % count), 'w') as f:
            f.write(json.dumps(clusters, indent=2, ensure_ascii=False).encode('utf8'))
def main(args):
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # load data
    data = [utils.load_data(path, id) for id, url in enumerate(urls)]
    random.shuffle(data)
    for page in data:
        random.shuffle(page['texts'])

    # process data
    processor = processors.Processor(data, tokenizer=tokenizers.GenericTokenizer, analyzer=analyzers.LongestAnalyzer)
    features = processor.extract()

    # clustering
    clusterer = clusterers.DBSCAN()
    labels = clusterer.cluster(features).labels_

    # prepare features
    continuous_features, discrete_features, labels = processor.prepare(labels)

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)
    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)

    # scale features
    features = preprocessing.scale(features)
    print(features.shape)

    precisions = []
    recalls = []
    f1scores = []
    supports = []
    rs = cross_validation.KFold(len(labels), n_folds=4, shuffle=False, random_state=0)
    for train_index, test_index in rs:
        print('training size = %d, testing size = %d' % (len(train_index), len(test_index)))

        clf = svm.SVC(verbose=False, kernel='linear', probability=False,
                      random_state=0, cache_size=2000, class_weight='auto')
        clf.fit(features[train_index], labels[train_index])
        print(clf.n_support_)

        print("training:")
        predicted = clf.predict(features[train_index])
        print(classification_report(labels[train_index], predicted))

        print("testing:")
        predicted = clf.predict(features[test_index])
        print(classification_report(labels[test_index], predicted))

        precision, recall, f1score, support = precision_recall_fscore_support(labels[test_index], predicted)
        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)
    for label in range(2):
        print('%f\t%f\t%f\t%f' % (precisions[label], recalls[label], f1scores[label], supports[label]))
    return

    # unreachable: split texts by label and report stats
    negatives = []
    positives = []
    for i in range(len(processor.texts)):
        if labels[i]:
            positives.append(processor.texts[i])
        else:
            negatives.append(processor.texts[i])
    stats(negatives, positives)
    return
def main(args):
    extractor = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'label.py')
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # load each JSON file from chaos.
    # Read each block of that file.
    # [P2] Sort the blocks by their size.
    # Also load the gold-text of that file.
    # If matching between gold-text and that element text is
    # above a certain threshold, label that block as 1.
    # [P2] remove the matching part from gold-text.
    # Rewrite the blocks to another json file.

    # extract data from each url
    # load data
    pages = []
    domains = collections.defaultdict(lambda: 0)
    for id, url in enumerate(urls):
        if not url.strip():
            continue
        host = url.split('/', 3)[2]
        #if domains[host] > 2:
        #    continue
        domains[host] += 1
        print host

        page = utils.load_data(path, id)
        processor = processors.Processor([page], tokenizer=tokenizers.GenericTokenizer, analyzer=analyzers.LongestAnalyzer)
        features = processor.extract()

        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        clusters = collections.defaultdict(list)
        for text, label in zip(processor.texts, labels):
            clusters[int(label)].append(text)

        gold_text = utils.load_gold_text(path, id)
        gold_text = processor.tokenizer.tokenize(gold_text)

        # label the cluster that best matches the gold text as positive
        max_score = 0
        best_label = None
        for label, texts in clusters.iteritems():
            tokens = ''
            for text in texts:
                tokens += text['tokens']
            score = processor.analyzer.get_similarity(tokens, gold_text)
            if score > max_score:
                max_score = score
                best_label = label

        for text in clusters[best_label]:
            text['label'] = 1

        page_texts = []
        for label, texts in clusters.iteritems():
            page_texts += texts
        random.shuffle(page_texts)
        pages.append(page_texts)

    #random.shuffle(pages)

    continuous_features = []
    discrete_features = []
    labels = []
    for page in pages:
        for text in page:
            text_length = len(text['tokens'])
            area = text['bound']['height'] * text['bound']['width']
            text_density = float(text_length) / float(area)

            # continuous_feature
            continuous_feature = []  #text_length, text_density]
            continuous_features.append(continuous_feature)

            # discrete features
            discrete_feature = dict()
            discrete_feature = dict(text['computed'].items())
            discrete_feature['path'] = ' > '.join(text['path'])
            """
            discrete_feature['selector'] = ' > '.join([
                '%s%s%s' % (
                    selector['name'],
                    '#' + selector['id'] if selector['id'] else '',
                    '.' + '.'.join(selector['classes']) if selector['classes'] else '',
                ) for selector in text['selector']
            ])
            """
            discrete_feature['class'] = ' > '.join([
                '%s%s' % (
                    selector['name'],
                    '.' + '.'.join(selector['classes']) if selector['classes'] else '',
                ) for selector in text['selector']
            ])
            """
            discrete_feature['id'] = ' > '.join([
                '%s%s' % (
                    selector['name'],
                    '#' + selector['id'] if selector['id'] else '',
                ) for selector in text['selector']
            ])
            """
            discrete_features.append(discrete_feature)

            # label
            labels.append(text['label'])

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)

    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)
    # scale features (must come after the hstack that defines `features`)
    features = preprocessing.scale(features)
    print features.shape

    precisions = []
    recalls = []
    f1scores = []
    supports = []
    rs = cross_validation.KFold(len(labels), n_folds=4, shuffle=False, random_state=0)
    for train_index, test_index in rs:
        print 'training size = %d, testing size = %d' % (len(train_index), len(test_index))

        clf = svm.SVC(verbose=False, kernel='linear', probability=False,
                      random_state=0, cache_size=2000, class_weight='auto')
        clf.fit(features[train_index], labels[train_index])
        print clf.n_support_
        """
        negatives = []
        for i in clf.support_[:clf.n_support_[0]]:
            negatives.append(all_texts[i])
        positives = []
        for i in clf.support_[clf.n_support_[0]:]:
            positives.append(all_texts[i])
        stats(negatives, positives)
        """
        print "training:"
        predicted = clf.predict(features[train_index])
        print classification_report(labels[train_index], predicted)

        print "testing:"
        predicted = clf.predict(features[test_index])
        print classification_report(labels[test_index], predicted)

        precision, recall, f1score, support = precision_recall_fscore_support(labels[test_index], predicted)
        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)
    for label in range(2):
        print '%f\t%f\t%f\t%f' % (precisions[label], recalls[label], f1scores[label], supports[label])
    return
def main(args):
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # load data
    data = [utils.load_data(path, id) for id, url in enumerate(urls)]
    random.shuffle(data)
    for page in data:
        random.shuffle(page["texts"])

    # process data
    processor = processors.Processor(data, tokenizer=tokenizers.GenericTokenizer, analyzer=analyzers.LongestAnalyzer)
    features = processor.extract()

    # clustering
    clusterer = clusterers.DBSCAN()
    labels = clusterer.cluster(features).labels_

    # prepare features
    continuous_features, discrete_features, labels = processor.prepare(labels)

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)
    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)

    # scale features
    features = preprocessing.scale(features)
    print features.shape

    precisions = []
    recalls = []
    f1scores = []
    supports = []
    rs = cross_validation.KFold(len(labels), n_folds=4, shuffle=False, random_state=0)
    for train_index, test_index in rs:
        print "training size = %d, testing size = %d" % (len(train_index), len(test_index))

        clf = svm.SVC(verbose=False, kernel="linear", probability=False,
                      random_state=0, cache_size=2000, class_weight="auto")
        clf.fit(features[train_index], labels[train_index])
        print clf.n_support_

        print "training:"
        predicted = clf.predict(features[train_index])
        print classification_report(labels[train_index], predicted)

        print "testing:"
        predicted = clf.predict(features[test_index])
        print classification_report(labels[test_index], predicted)

        precision, recall, f1score, support = precision_recall_fscore_support(labels[test_index], predicted)
        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)
    for label in range(2):
        print "%f\t%f\t%f\t%f" % (precisions[label], recalls[label], f1scores[label], supports[label])
    return

    # unreachable: split texts by label and report stats
    negatives = []
    positives = []
    for i in range(len(processor.texts)):
        if labels[i]:
            positives.append(processor.texts[i])
        else:
            negatives.append(processor.texts[i])
    stats(negatives, positives)
    return
def main(args):
    # path = utils.get_data_path(args.site[0])
    sites = ['safari']
    all_continuous_features = []
    all_discrete_features = []
    all_labels = []
    for site in sites:
        print('clustering %s ...' % site)

        path = utils.get_data_path(site)
        urls = utils.load_urls(path)

        # load data
        data = [utils.load_data(path, id) for id, url in enumerate(urls)]

        # process data
        processor = processors.Processor(data, tokenizer=tokenizers.GenericTokenizer, analyzer=analyzers.LongestAnalyzer)
        features = processor.extract()

        # clustering
        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        # prepare features
        continuous_features, discrete_features, labels, clusters = processor.prepare(labels)
        all_continuous_features += continuous_features
        all_discrete_features += discrete_features
        all_labels += labels

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(all_discrete_features).toarray()
    continuous_features = np.array(all_continuous_features)
    labels = np.array(all_labels).astype(np.float32)
    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)

    precisions = []
    recalls = []
    f1scores = []
    supports = []
    rs = KFold(4).split(labels)
    # rs = cross_validation.KFold(len(labels), n_folds=4, shuffle=False, random_state=0)
    for train_index, test_index in rs:
        # print('training size = %d, testing size = %d' % (len(train_index), len(test_index)))

        clf = svm.SVC(verbose=False, kernel='linear', probability=False,
                      random_state=0, cache_size=2000, class_weight='balanced')
        clf.fit(features[train_index], labels[train_index])

        predicted = clf.predict(features[test_index])
        print(classification_report(labels[test_index], predicted))

        precision, recall, f1score, support = precision_recall_fscore_support(labels[test_index], predicted)
        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)
    for label in range(2):
        print('%f\t%f\t%f\t%f' % (precisions[label], recalls[label], f1scores[label], supports[label]))

    # split cluster contents into ham/spam by cluster label
    ham = collections.defaultdict(dict)
    spam = collections.defaultdict(dict)
    pageId = 0
    for id, cluster in clusters.items():
        for page in cluster['pages'].values():
            content = ''
            for text in page['texts']:
                content += ' '.join(text['text'])
            if cluster['label'] == 1:
                ham[pageId][id] = content
            else:
                spam[pageId][id] = content
            pageId = pageId + 1

    with open(os.path.join(path, 'svm.json'), 'wb') as f:
        f.write(json.dumps({'ham': ham, 'spam': spam}, indent=2, ensure_ascii=False).encode('utf8'))
    return
def main(args):
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)
    #print path
    count = 0

    # load data
    data = [utils.load_data(path, id) for id, url in enumerate(urls)]
    random.shuffle(data)
    for page in data:
        # print count
        #count += 1
        random.shuffle(page['texts'])

    # process data
    processor = processors.Processor(data, tokenizer=tokenizers.GenericTokenizer, analyzer=analyzers.LongestAnalyzer)
    features = processor.extract()
    #print len(features)

    # clustering
    clusterer = clusterers.DBSCAN()
    labels = clusterer.cluster(features).labels_

    # prepare features
    continuous_features, discrete_features, cluster_labels, texts1, urls, classes = processor.prepare(labels, path)
    res = [texts1, labels]
    # print len(texts1)

    # decode the unicode strings
    lab = []
    urlss = []
    for k in texts1:
        lab.append(k.encode('ascii', 'ignore'))
    for l in urls:
        urlss.append(l.encode('ascii', 'ignore'))

    with open("rohit.csv", "w") as fp:
        writer = csv.writer(fp)
        for row in zip(urls, lab, labels, classes):
            writer.writerow(row)

    raw_input("enter data ")  # pause so the CSV can be labelled before training

    # Label the dataset with the class each row belongs to.
    # Classes are in the 4th column.
    classes = []
    with open("rohit.csv", "r") as fp:
        reader = csv.reader(fp)
        for row in reader:
            classes += [row[3]]

    for i in xrange(1, len(classes)):
        if classes[i] == 0:
            classes[i] = cluster_labels[i]

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    discrete_features.resize(len(discrete_features), 10000)
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)
    #print len(discrete_features[2])
    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)
    #print features

    # scale features
    features = preprocessing.scale(features)  # preprocess the features

    # make a random forest model and fit it to the features and classes
    rf = RandomForestClassifier(n_estimators=300)
    rf.fit(features, classes)

    # dump the model file into the model directory, then reload it
    filename = '/home/test/nutch/runtime/local/phantomjslearning/classlibraries/ivfhaveababy.joblib.pkl'
    _ = joblib.dump(rf, filename, compress=9)
    rf = joblib.load(filename)

    precisions = []
    recalls = []
    f1scores = []
    supports = []
    return
def main(args):
    # path = utils.get_data_path(args.site[0])
    sites = ['theverge', 'sina', 'qq', 'techcrunch', 'usatoday', 'npr', 'prothomalo']
    all_continuous_features = []
    all_discrete_features = []
    all_labels = []
    for site in sites:
        print 'clustering %s ...' % site

        path = utils.get_data_path(site)
        urls = utils.load_urls(path)

        # load data
        data = [utils.load_data(path, id) for id, url in enumerate(urls)]

        # process data
        processor = processors.Processor(data, tokenizer=tokenizers.GenericTokenizer, analyzer=analyzers.LongestAnalyzer)
        features = processor.extract()

        # clustering
        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        # prepare features
        continuous_features, discrete_features, labels = processor.prepare(labels)
        all_continuous_features += continuous_features
        all_discrete_features += discrete_features
        all_labels += labels

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(all_discrete_features).toarray()
    continuous_features = np.array(all_continuous_features)
    labels = np.array(all_labels).astype(np.float32)
    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)

    precisions = []
    recalls = []
    f1scores = []
    supports = []
    rs = cross_validation.KFold(len(labels), n_folds=4, shuffle=False, random_state=0)
    for train_index, test_index in rs:
        print 'training size = %d, testing size = %d' % (len(train_index), len(test_index))

        clf = svm.SVC(verbose=False, kernel='linear', probability=False,
                      random_state=0, cache_size=2000, class_weight='auto')
        clf.fit(features[train_index], labels[train_index])

        predicted = clf.predict(features[test_index])
        print classification_report(labels[test_index], predicted)

        precision, recall, f1score, support = precision_recall_fscore_support(labels[test_index], predicted)
        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)
    for label in range(2):
        print '%f\t%f\t%f\t%f' % (precisions[label], recalls[label], f1scores[label], supports[label])
    return
def main(args):
    extractor = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'label.py')
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)

    # load each JSON file from chaos.
    # Read each block of that file.
    # [P2] Sort the blocks by their size.
    # Also load the gold-text of that file.
    # If matching between gold-text and that element text is
    # above a certain threshold, label that block as 1.
    # [P2] remove the matching part from gold-text.
    # Rewrite the blocks to another json file.

    # extract data from each url
    # load data
    pages = []
    domains = collections.defaultdict(lambda: 0)
    for id, url in enumerate(urls):
        if not url.strip():
            continue
        host = url.split('/', 3)[2]
        #if domains[host] > 2:
        #    continue
        domains[host] += 1
        print(host)

        page = utils.load_data(path, id)
        processor = processors.Processor([page], tokenizer=tokenizers.GenericTokenizer, analyzer=analyzers.LongestAnalyzer)
        features = processor.extract()

        clusterer = clusterers.DBSCAN()
        labels = clusterer.cluster(features).labels_

        clusters = collections.defaultdict(list)
        for text, label in zip(processor.texts, labels):
            clusters[int(label)].append(text)

        gold_text = utils.load_gold_text(path, id)
        gold_text = processor.tokenizer.tokenize(gold_text)

        # label the cluster that best matches the gold text as positive
        max_score = 0
        best_label = None
        for label, texts in clusters.items():
            tokens = ''
            for text in texts:
                tokens += text['tokens']
            score = processor.analyzer.get_similarity(tokens, gold_text)
            if score > max_score:
                max_score = score
                best_label = label

        for text in clusters[best_label]:
            text['label'] = 1

        page_texts = []
        for label, texts in clusters.items():
            page_texts += texts
        random.shuffle(page_texts)
        pages.append(page_texts)

    #random.shuffle(pages)

    continuous_features = []
    discrete_features = []
    labels = []
    for page in pages:
        for text in page:
            text_length = len(text['tokens'])
            area = text['bound']['height'] * text['bound']['width']
            text_density = float(text_length) / float(area)

            # continuous_feature
            continuous_feature = []  #text_length, text_density]
            continuous_features.append(continuous_feature)

            # discrete features
            discrete_feature = dict()
            discrete_feature = dict(text['computed'].items())
            discrete_feature['path'] = ' > '.join(text['path'])
            """
            discrete_feature['selector'] = ' > '.join([
                '%s%s%s' % (
                    selector['name'],
                    '#' + selector['id'] if selector['id'] else '',
                    '.' + '.'.join(selector['classes']) if selector['classes'] else '',
                ) for selector in text['selector']
            ])
            """
            discrete_feature['class'] = ' > '.join([
                '%s%s' % (
                    selector['name'],
                    '.' + '.'.join(selector['classes']) if selector['classes'] else '',
                ) for selector in text['selector']
            ])
            """
            discrete_feature['id'] = ' > '.join([
                '%s%s' % (
                    selector['name'],
                    '#' + selector['id'] if selector['id'] else '',
                ) for selector in text['selector']
            ])
            """
            discrete_features.append(discrete_feature)

            # label
            labels.append(text['label'])

    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)

    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)
    # scale features (must come after the hstack that defines `features`)
    features = preprocessing.scale(features)
    print(features.shape)

    precisions = []
    recalls = []
    f1scores = []
    supports = []
    # rs = cross_validation.KFold(len(labels), n_folds=4, shuffle=False, random_state=0)
    rs = KFold(4).split(labels)
    for train_index, test_index in rs:
        print('training size = %d, testing size = %d' % (len(train_index), len(test_index)))

        clf = svm.SVC(verbose=False, kernel='linear', probability=False,
                      random_state=0, cache_size=2000, class_weight='auto')
        clf.fit(features[train_index], labels[train_index])
        print(clf.n_support_)
        """
        negatives = []
        for i in clf.support_[:clf.n_support_[0]]:
            negatives.append(all_texts[i])
        positives = []
        for i in clf.support_[clf.n_support_[0]:]:
            positives.append(all_texts[i])
        stats(negatives, positives)
        """
        print("training:")
        predicted = clf.predict(features[train_index])
        print(classification_report(labels[train_index], predicted))

        print("testing:")
        predicted = clf.predict(features[test_index])
        print(classification_report(labels[test_index], predicted))

        precision, recall, f1score, support = precision_recall_fscore_support(labels[test_index], predicted)
        precisions.append(precision)
        recalls.append(recall)
        f1scores.append(f1score)
        supports.append(support)

    precisions = np.mean(np.array(precisions), axis=0)
    recalls = np.mean(np.array(recalls), axis=0)
    f1scores = np.mean(np.array(f1scores), axis=0)
    supports = np.mean(np.array(supports), axis=0)
    for label in range(2):
        print('%f\t%f\t%f\t%f' % (precisions[label], recalls[label], f1scores[label], supports[label]))
    return
def main(args):
    path = utils.get_data_path(args.site[0])
    urls = utils.load_urls(path)
    #print path
    count = 0

    # load data
    data = [utils.load_data(path, id) for id, url in enumerate(urls)]
    random.shuffle(data)
    for page in data:
        # print count
        #count += 1
        random.shuffle(page['texts'])

    # process data
    processor = processors.Processor(data, tokenizer=tokenizers.GenericTokenizer, analyzer=analyzers.LongestAnalyzer)
    features = processor.extract()
    #print len(features)

    # clustering
    clusterer = clusterers.DBSCAN()
    labels = clusterer.cluster(features).labels_

    # prepare features
    # This retrieves the clustered dataset and stores it in the lists below.
    continuous_features, discrete_features, cluster_labels, texts1, urls, classes = processor.prepare(labels, path)

    # decode the unicode into text
    lab = []
    urlss = []
    for k in texts1:
        lab.append(k.encode('ascii', 'ignore'))

    # feature extraction (as described in the doc)
    vectorizer = DictVectorizer()
    discrete_features = vectorizer.fit_transform(discrete_features).toarray()
    # resize the discrete_features array to a uniform size so that the model and
    # the test data set have feature arrays of the same length
    discrete_features.resize(len(discrete_features), 10000)
    continuous_features = np.array(continuous_features)
    labels = np.array(labels).astype(np.float32)
    features = np.hstack([continuous_features, discrete_features]).astype(np.float32)
    # normalization step: scale the features
    features = preprocessing.scale(features)

    # pick the model whose reference screenshot is closest to this site's screenshot
    im1 = Image.open("/home/test/nutch/runtime/local/phantomjslearning/data/dazedandconfused/000.png")
    im2 = Image.open("/home/test/nutch/runtime/local/phantomjslearning/data/fruitsofotherhands/000.png")
    im3 = Image.open("/home/test/nutch/runtime/local/phantomjslearning/data/rohitanurag/000.png")
    im4 = Image.open("/home/test/nutch/runtime/local/phantomjslearning/data/thegirlwhoreadtoomuch/000.png")
    im5 = Image.open("/home/test/nutch/runtime/local/phantomjslearning/data/timcotson/000.png")
    imtest = Image.open(path + "/000.png")

    result1 = equal(imtest, im1)
    result2 = equal(imtest, im2)
    result3 = equal(imtest, im3)
    result4 = equal(imtest, im4)
    result5 = equal(imtest, im5)

    choose = 0
    testresult = result1
    if result1 <= testresult:
        choose = 1
        testresult = result1
    if result2 <= testresult:
        choose = 2
        testresult = result2
    if result3 <= testresult:
        choose = 3
        testresult = result3
    if result4 <= testresult:
        choose = 4
        testresult = result4
    if result5 <= testresult:
        choose = 5
        testresult = result5

    if choose == 1:
        usemodel = "/home/test/nutch/runtime/local/phantomjslearning/classlibraries/rfdazedandconfused.joblib.pkl"
    if choose == 2:
        usemodel = "/home/test/nutch/runtime/local/phantomjslearning/classlibraries/rffruitsofother.joblib.pkl"
    if choose == 3:
        usemodel = "/home/test/nutch/runtime/local/phantomjslearning/classlibraries/rfrohitanurag.joblib.pkl"
    if choose == 4:
        usemodel = "/home/test/nutch/runtime/local/phantomjslearning/classlibraries/rfthegirlwhoused.joblib.pkl"
    if choose == 5:
        usemodel = "/home/test/nutch/runtime/local/phantomjslearning/classlibraries/rftimscoton.joblib.pkl"

    # Here we get the trained model which we use to predict classes
    # such as title, date and paragraphs of blogs.
    usemodel = "/home/test/nutch/runtime/local/phantomjslearning/classlibraries/ivfhaveababy.joblib.pkl"
    # load the model and then use it for prediction
    rf = joblib.load(usemodel)
    predicted = rf.predict(features)
    print usemodel

    for i in xrange(1, len(predicted)):
        print lab[i]
        print "*********"
        print predicted[i]
        print "**********"
    return