def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    for database in self.selected_dbs:
        print "Processing database " + database
        cursor = connections[database].cursor()

        print "Grabbing newsitems data"
        queryset = Newsitem.objects\
            .using(database)\
            .exclude(cat1__isnull=True)\
            .only('cat1')

        newsitems_cat1 = {}
        self.pbar_setup(maxval=queryset.count())
        for newsitem in queryset_iterator(queryset, chunksize=1000):
            if newsitem.cat1 is None:
                pdb.set_trace()
            newsitems_cat1[newsitem.id] = newsitem.cat1
            self.pbar_increment()
        self.pbar_destroy()

        print "Grabbing comments and their texts, then dumping them to target folder ..."
        queryset = Comment.objects\
            .using(database)\
            .filter(newsitem_id__in=newsitems_cat1.keys())\
            .only('date', 'newsitem_id', 'parent_id', 'text')

        self.pbar_setup(maxval=queryset.count())
        for comment in queryset_iterator(queryset, chunksize=1000):
            if not comment.text or not comment.text.text:
                continue

            cat1 = newsitems_cat1[comment.newsitem_id]
            if cat1 is None:
                pdb.set_trace()

            filename = "{}_{}_{}.txt".format(
                cat1, comment.date.date().isoformat(), comment.id)

            target_filename = os.path.join(options['TARGET_DIR_ALL'], filename)
            fileobj = codecs.open(target_filename, 'w', 'utf-8')
            fileobj.write(comment.text.text)
            fileobj.close()

            if comment.parent_id is None:
                target_filename = os.path.join(options['TARGET_DIR_NEWS'], filename)
                fileobj = codecs.open(target_filename, 'w', 'utf-8')
                fileobj.write(comment.text.text)
                fileobj.close()

            self.pbar_increment()
        self.pbar_destroy()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
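# `queryset_iterator` is a project helper used throughout these commands but not
# defined in this file. The sketch below is only an assumed, minimal version of
# the common "chunked queryset" recipe (names and the pk-ordering strategy are
# assumptions, not the project's confirmed code): it walks a large queryset in
# primary-key order, `chunksize` rows at a time, and collects garbage between
# chunks so the queryset cache does not exhaust memory.
import gc

def queryset_iterator(queryset, chunksize=1000):
    if not queryset.exists():
        return
    pk = 0
    last_pk = queryset.order_by('-pk')[0].pk
    queryset = queryset.order_by('pk')
    while pk < last_pk:
        for row in queryset.filter(pk__gt=pk)[:chunksize]:
            pk = row.pk
            yield row
        gc.collect()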
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    images_re = re.compile("!\[(.+)\]\((.+)\)")
    socials_re = re.compile("Facebook Twitter Pinterest")

    for database in self.selected_dbs:
        print "Processing database " + database
        cursor = connections[database].cursor()

        print "Grabbing newsitems and their texts, then dumping them to target folder ..."
        queryset = Newsitem.objects\
            .using(database)\
            .exclude(cat1__isnull=True)\
            .only('date', 'cat1', 'title', 'text')

        self.pbar_setup(maxval=queryset.count())
        for newsitem in queryset_iterator(queryset, chunksize=1000):
            if not newsitem.text or not newsitem.text.text:
                continue

            filename = "{}_{}_news_{}.txt".format(
                newsitem.cat1, newsitem.date.date().isoformat(), newsitem.id)

            target_filename = os.path.join(options['TARGET_DIR'], filename)
            fileobj = codecs.open(target_filename, 'w', 'utf-8')
            fileobj.write(newsitem.title + ". \n" + newsitem.text.text)
            fileobj.close()

            self.pbar_increment()
        self.pbar_destroy()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    citations_re = re.compile("\".*\"")

    for database in self.selected_dbs:
        print "Processing database " + database
        cursor = connections[database].cursor()

        print "Grabbing newsitems and calculating..."
        queryset = Text.objects.using(database).filter(newsitem__isnull=False)

        self.pbar_setup(maxval=queryset.count())
        for text in queryset_iterator(queryset, chunksize=1000):
            qty_citations = len(citations_re.findall(text.text))
            if qty_citations > 0:
                query = """
                    UPDATE newsitem
                    SET qty_citations = %s
                    WHERE text_id = %s
                """
                cursor.execute(query, [qty_citations, text.id])
            self.pbar_increment()
        self.pbar_destroy()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    print "Loading classifier"
    classifier_path = os.path.join(
        settings.BASE_DIR,
        "lda_stats/classifiers/naive_bayes.classifier.cpickle")
    nb = cPickle.load(open(classifier_path, 'rb'))

    filters = {}
    if options['only_newsitems']:
        filters['newsitem__isnull'] = False
    elif options['only_comments']:
        filters['comment__isnull'] = False

    for database in self.selected_dbs:
        print "Processing database " + database

        algorithm_naive_bayes, created = Algorithm.objects.using(database)\
            .get_or_create(name="naive_bayes")

        print "Removing previous results"
        query = """
            DELETE FROM result
            WHERE algorithm_id IN (
                SELECT id FROM algorithm WHERE name = 'naive_bayes'
            )"""
        cursor = connections[database].cursor()
        cursor.execute(query)

        print "Querying database"
        queryset = Text.objects.using(database).filter(**filters)

        results = []
        print "Calculating..."
        self.pbar_setup(maxval=queryset.count())
        for text in queryset_iterator(queryset, chunksize=2000):
            estimate = nb.predict([
                utils.TFIDF_to_list(utils.TFIDF(utils.tokenize(text.text)))
            ])
            results.append(
                Result(algorithm=algorithm_naive_bayes,
                       text=text,
                       value=str(estimate[0])))
            self.pbar_increment()

            if len(results) > 100000:
                print "\nSaving partial results..."
                Result.objects.using(database).bulk_create(results)
                results = []
        self.pbar_destroy()

        print "Saving results"
        Result.objects.using(database).bulk_create(results)

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
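# Several commands in this file feed classifiers with
# utils.TFIDF_to_list(utils.TFIDF(utils.tokenize(text))). Those helpers live in
# the project's utils module and are not reproduced here; the sketch below only
# illustrates the assumed contract (tokenize -> term-weight dict -> fixed-order
# feature vector). The names, the plain term-frequency weighting, and the shared
# vocabulary list are assumptions for illustration, not the project's code.
import re
from collections import Counter

VOCABULARY = []  # assumed: a fixed, shared word list so feature vectors align

def tokenize(text):
    return re.findall(r"\w+", text.lower())

def TFIDF(tokens):
    counts = Counter(tokens)
    total = float(len(tokens)) or 1.0
    return dict((word, counts[word] / total) for word in counts)

def TFIDF_to_list(weights):
    return [weights.get(word, 0.0) for word in VOCABULARY]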
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    queryset = Newsitem.objects.filter(topics__isnull=True)
    self.pbar_setup(maxval=queryset.count())
    done = 0
    for newsitem in queryset_iterator(queryset):
        try:
            resp = requests.get(newsitem.url)
            soup = BeautifulSoup(resp.text, "html.parser")
            for meta in soup('meta'):
                attrs = meta.__dict__['attrs']
                if 'property' in attrs and attrs['property'] == 'article:tag':
                    with transaction.atomic():
                        for topic_name in attrs['content'].split(','):
                            topic, created = Topic.objects.get_or_create(
                                name=topic_name)
                            newsitem.topics.add(topic)
                        newsitem.save()
                    break
        except Exception as e:
            print "Error processing URL: {0} -=- message: {1}".format(
                newsitem.url, e)
        self.pbar_increment()
    self.pbar_destroy()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def migrate(apps, schema_editor, direction):
    pbar_widgets = [
        SimpleProgress(), ' ',
        Percentage(), ' ',
        Bar(), ' ',
        Timer(), ' ',
        AdaptiveETA()
    ]

    for model in ['Comment', 'Newsitem']:
        queryset = globals()[model].objects.filter(text__isnull=True)
        if queryset.count() == 0:
            continue

        print "Processing " + model
        pbar = ProgressBar(widgets=pbar_widgets, maxval=queryset.count())
        done = 0
        pbar.start()

        tokenizer = RegexpTokenizer(r'\w+')
        for item in queryset_iterator(queryset, chunksize=20):
            if item.content:
                if direction == "forward":
                    item.text = Text.objects.create(
                        text=item.content,
                        wordcount=len(tokenizer.tokenize(item.content)))
                elif direction == "backwards":
                    item.content = item.text.text
                item.save()
            done += 1
            pbar.update(done)
        pbar.finish()
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    self.html_parser = HTMLParser.HTMLParser()

    for db_name in self.selected_dbs:
        print "Calculating wordcount for database " + db_name
        queryset = Text.objects.using(db_name).all()

        with transaction.atomic():
            self.pbar_setup(maxval=queryset.count())
            for text in queryset_iterator(queryset, chunksize=10000):
                wordcount = len(self.clean_text(text.text).split())
                text.wordcount = wordcount
                text.save()
                self.pbar_increment()
            self.pbar_destroy()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def generate_author_data(self, database):
    self.author_data = {}

    for model in [Newsitem, Comment]:
        if model == Newsitem:
            model_name = "Newsitem"
        else:
            model_name = "Comment"
        self.stdout.write("Processing model " + model_name)

        queryset = model.objects.using(database)\
            .filter(text__isnull=False)\
            .select_related('text__text')
        if model == Comment:
            queryset = queryset.select_related('newsitem__idauthor',
                                               'parent__authorid')

        self.pbar_setup(maxval=queryset.count())
        queryset_iter = queryset_iterator(queryset, chunksize=100)

        pool = Pool()
        #pool.map_async(work, queryset_iter, callback=self.pbar_increment())
        #for item, tokenized in pool.imap_unordered(tokenize, queryset_iter, 100):
        for item in queryset_iter:
            item, tokenized = tokenize(item)
            self.work(item, tokenized)
            self.pbar_increment()
        self.pbar_destroy()
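# `tokenize` above is a module-level worker (so it could also be handed to a
# multiprocessing.Pool, as the commented-out lines suggest). It is not defined
# in this file; this is a minimal sketch of the assumed contract only -- it
# takes a Newsitem/Comment instance and returns the same instance together with
# its tokenized text. The NLTK call is an assumption for illustration.
import nltk

def tokenize(item):
    text = item.text.text if item.text else ""
    return item, nltk.word_tokenize(text)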
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    filters = {}
    if options['only_newsitems']:
        filters['newsitem__isnull'] = False
    elif options['only_comments']:
        filters['comment__isnull'] = False

    for database in self.selected_dbs:
        print "Processing database " + database

        algorithm_nlp_rake, _ = Algorithm.objects.using(database)\
            .get_or_create(name="nlp_rake")

        queryset = Text.objects.using(database).filter(**filters)

        print "Removing previous results"
        query = """
            DELETE FROM keyword WHERE algorithm_id = %s
        """
        cursor = connections[database].cursor()
        cursor.execute(query, [algorithm_nlp_rake.id])

        queryset_iter = queryset_iterator(queryset, chunksize=1000)

        print "Calculating..."
        self.pbar_setup(maxval=queryset.count())

        results = []
        pool = Pool()
        bulk_create_process = None
        for text in queryset_iter:
            keywords = work(text)
            for keyword, score in keywords:
                results.append(
                    Keyword(
                        algorithm=algorithm_nlp_rake,
                        text=text,
                        keyword=keyword,
                        score=score,
                    ))
            if len(results) > 250:
                Keyword.objects.using(database).bulk_create(results)
                results = []
            self.pbar_increment()
        self.pbar_destroy()

        if len(results) > 0:
            print "Saving last results..."
            Keyword.objects.using(database).bulk_create(results)

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def lda(self, database, table="Comment", date_to=None, date_from=None,
        num_topics=10, alpha=0.5, beta=0.5, iteration_count=25,
        smart_init=False, phrase=False):
    model = self.str_to_class(table)
    items = model.objects.using(database).exclude(text__isnull=True)\
        .filter(date__gte=date_from, date__lte=date_to)\
        .only('text')
    if items.count() == 0:
        return []

    contents = [i.text.text for i in items]

    all_docs = []
    if phrase:
        for i in queryset_iterator(items, chunksize=100):
            rake = Rake('stopwords.txt')
            keywords = rake.run(i.text.text)
            all_docs.append([k[0] for k in keywords if " " in k[0]])
    else:
        for i in queryset_iterator(items, chunksize=100):
            tokenized = nltk.word_tokenize(
                i.text.text.encode("ascii", "ignore"))
            all_docs.append(tokenized)

    stopwords = Stopwords('stopwords.txt')
    voca = Vocabulary(stopwords, excluds_stopwords=True)
    docs = [voca.doc_to_ids(doc) for doc in all_docs]

    lda = LDA(num_topics, alpha, beta, docs, voca.size(), smart_init)
    topics = lda_learning(lda, iteration_count, voca)
    return topics
def check_items(self):
    queryset = Newsitem.objects.filter(Q(lead="") |
                                       Q(text__isnull=True) |
                                       Q(title="") |
                                       Q(date__isnull=True) |
                                       Q(idauthor=""))
    for newsitem in queryset_iterator(queryset, chunksize=100):
        self.stdout.write("Processing URL " + newsitem.url)
        soup = BeautifulSoup(requests.get(newsitem.url).text, "html.parser")
        self.update_news_info(newsitem, soup)
        newsitem.save()
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    if len(self.selected_dbs) > 1:
        self.stdout.write(self.style.ERROR(
            'You must choose a specific DB for this command'))
        return
    database = self.selected_dbs[0]

    current_text_id = 560000
    for model in ['Newsitem']:
        queryset = globals()[model].objects.using(database).filter(
            text__isnull=True)
        if queryset.count() == 0:
            continue

        text_list = []
        item_list = []

        print "Processing " + model
        self.pbar_setup(maxval=queryset.count())

        tokenizer = RegexpTokenizer(r'\w+')
        for item in queryset_iterator(queryset, chunksize=1000):
            if item.content:
                current_text = Text(
                    id=current_text_id,
                    text=item.content,
                    wordcount=len(tokenizer.tokenize(item.content)))
                text_list.append(current_text)
                item.text = current_text
                item_list.append(item)
                current_text_id += 1
            self.pbar_increment()

            if len(text_list) >= 2000:
                Text.objects.using(database).bulk_create(text_list)
                with transaction.atomic(database):
                    for (item, text) in itertools.izip(item_list, text_list):
                        item.text = text
                        item.save()
                item_list = []
                text_list = []
        self.pbar_destroy()

        print "Saving final results..."
        Text.objects.using(database).bulk_create(text_list)
        with transaction.atomic(database):
            for (item, text) in itertools.izip(item_list, text_list):
                item.text = text
                item.save()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    images_re = re.compile("!\[(.+)\]\((.+)\)\n\n.+Photograph: .+\n\n")
    video_re = re.compile(u' – video[ \w\*]*')

    for database in self.selected_dbs:
        print "Processing database " + database
        cursor = connections[database].cursor()

        print "Grabbing newsitems and calculating..."
        queryset = Text.objects.using(database).filter(newsitem__isnull=False)

        self.pbar_setup(maxval=queryset.count())
        for text in queryset_iterator(queryset, chunksize=1000):
            new_text = ""

            subn = images_re.subn("", text.text)
            qty_images = subn[1]

            qty_videos = 0
            for line in subn[0].split("\n"):
                if line.lower().strip() == "read more":
                    continue
                elif line.lower().strip() == "facebook twitter pinterest":
                    continue
                elif video_re.search(line):
                    qty_videos += 1
                else:
                    new_text += line + "\n"

            text.text = new_text
            text.save()

            if qty_images > 0 or qty_videos > 0:
                query = """
                    UPDATE newsitem
                    SET qty_images = %s, qty_videos = %s
                    WHERE text_id = %s
                """
                cursor.execute(query, [qty_images, qty_videos, text.id])

            self.pbar_increment()
        self.pbar_destroy()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def handle(self, *args, **options):
    queryset = Newsitem.objects
    self.pbar_setup(maxval=queryset.count())
    newsitems = queryset_iterator(queryset.all(), chunksize=100)
    for newsitem in newsitems:
        newsitem.cat1 = random.randint(1, 5)
        newsitem.save()
        self.pbar_increment()
    self.pbar_destroy()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    for database in self.selected_dbs:
        print "Processing database " + database
        pool = Pool()

        #print "Wordcounting newsitems"
        #self.wordcount = {}
        #
        #queryset = Newsitem.objects.using(database).select_related('text')\
        #    .filter(text__isnull=False)
        #queryset_iter = queryset_iterator(queryset, chunksize=25)
        #
        #self.pbar_setup(maxval=queryset.count())
        #for wordcount in pool.imap_unordered(count_words, queryset_iter):
        #    self.merge_wordcount(wordcount)
        #    self.pbar_increment()
        #self.pbar_destroy()
        #
        #output = codecs.open('wordcount_newsitems.csv', 'w', 'utf-8')
        #output.write("word,count\n")
        #
        #for word, count in self.wordcount.items():
        #    output.write("{},{}\n".format(word, count))

        print "Wordcounting comments"
        self.wordcount = {}

        queryset = Comment.objects.using(database).select_related('text')\
            .filter(text__isnull=False)
        queryset_iter = queryset_iterator(queryset, chunksize=100)

        self.pbar_setup(maxval=queryset.count())
        for wordcount in pool.imap_unordered(count_words, queryset_iter):
            self.merge_wordcount(wordcount)
            self.pbar_increment()
        self.pbar_destroy()

        output = codecs.open('wordcount_comments.csv', 'w', 'utf-8')
        output.write("word,count\n")
        for word, count in self.wordcount.items():
            output.write(u"{},{}\n".format(word, count))

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
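# `count_words` is the multiprocessing worker the wordcount command maps over
# comments, and `merge_wordcount` (a method on the Command above) folds each
# per-item result into self.wordcount. Neither is defined in this file; this is
# a minimal sketch of the assumed behaviour, not the project's confirmed code.
import re
from collections import Counter

def count_words(item):
    # per-item worker: token -> frequency for one comment's text
    tokens = re.findall(r"\w+", item.text.text.lower())
    return Counter(tokens)

def merge_wordcount(total, wordcount):
    # fold one worker result into the running total dict
    for word, count in wordcount.items():
        total[word] = total.get(word, 0) + count
    return total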
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    queryset = Newsitem.objects.all()
    self.pbar_setup(maxval=queryset.count())
    for newsitem in queryset_iterator(queryset, chunksize=10):
        shortId = newsitem.url.split('/')[-1]
        current_page = 1
        while True:
            resp = requests.get(
                "https://api.nextgen.guardianapps.co.uk/discussion/p/{0}.json"
                .format(shortId), {
                    'page': current_page,
                    'orderBy': 'oldest',
                    'pageSize': 100,
                    'displayThreaded': 'true',
                    'maxResponses': 1000000
                })
            if resp.status_code == 404:
                break

            try:
                dejson = resp.json()
                comment_soup = BeautifulSoup(dejson['commentsHtml'],
                                             "html.parser")
            except MemoryError:
                print "MemoryError when dealing with URL " + newsitem.url
                break  # skip the remaining pages of this newsitem

            with transaction.atomic():
                self.process_comment_ul(comment_soup.ul, newsitem, parent=None)

            current_page += 1
            if current_page > dejson['lastPage']:
                break
        self.pbar_increment()
    self.pbar_destroy()
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    filters = {}
    if options['only_newsitems']:
        filters['newsitem__isnull'] = False
    elif options['only_comments']:
        filters['comment__isnull'] = False

    for database in self.selected_dbs:
        print "Processing database " + database

        algorithm_tf_naive_bayes, created = Algorithm.objects.using(database)\
            .get_or_create(name="tf_naive_bayes")

        print "Removing previous results"
        query = """
            DELETE FROM result
            WHERE algorithm_id IN (
                SELECT id FROM algorithm WHERE name = 'tf_naive_bayes'
            )"""
        cursor = connections[database].cursor()
        cursor.execute(query)

        print "Querying database"
        queryset = Text.objects.using(database).filter(**filters)

        results = []
        self.pbar_setup(maxval=queryset.count())
        queryset_iter = queryset_iterator(queryset, chunksize=2000)

        print "Calculating..."
        bulk_create_process = None
        pool = Pool()
        for text, result in pool.imap_unordered(estimate, queryset_iter):
            results.append(
                Result(algorithm=algorithm_tf_naive_bayes,
                       text=text,
                       value=str(result[0])))
            self.pbar_increment()

            if len(results) >= 10000:
                if bulk_create_process and bulk_create_process.is_alive():
                    bulk_create_process.join()
                bulk_create_process = Process(target=bulk_create,
                                              kwargs={
                                                  'database': database,
                                                  'results': copy.copy(results)
                                              })
                bulk_create_process.start()
                results = []
        self.pbar_destroy()

        print "Saving results"
        Result.objects.using(database).bulk_create(results)

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
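# `estimate` and `bulk_create` are module-level helpers used with
# multiprocessing above (Pool/Process targets must be picklable, hence module
# level). They are not defined in this file; the sketch below shows only the
# assumed contracts: `estimate` returns the Text together with the classifier's
# prediction, and `bulk_create` persists a batch on the given database from a
# child process. The classifier object `nb` (loaded elsewhere) is an assumption.
def estimate(text):
    # returns (text, prediction) so the parent process can build Result rows
    features = utils.TFIDF_to_list(utils.TFIDF(utils.tokenize(text.text)))
    return text, nb.predict([features])

def bulk_create(database, results):
    # runs in a child process so the main loop keeps iterating while rows save
    Result.objects.using(database).bulk_create(results)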
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    filters = {}
    if options['only_newsitems']:
        filters['newsitem__isnull'] = False
    elif options['only_comments']:
        filters['comment__isnull'] = False

    bulk_create_process = None
    for database in self.selected_dbs:
        print "Processing database " + database

        algorithm_anew_valence, _ = Algorithm.objects.using(database)\
            .get_or_create(name="anew_valence")
        algorithm_anew_arousal, _ = Algorithm.objects.using(database)\
            .get_or_create(name="anew_arousal")
        algorithm_anew_dominance, _ = Algorithm.objects.using(database)\
            .get_or_create(name="anew_dominance")

        queryset = Text.objects.using(database).filter(**filters)

        print "Removing previous results"
        query = """
            DELETE FROM result
            WHERE algorithm_id IN (
                SELECT id FROM algorithm
                WHERE name IN ('anew_valence', 'anew_arousal', 'anew_dominance')
            )"""
        cursor = connections[database].cursor()
        cursor.execute(query)

        results = []
        queryset_iter = queryset_iterator(queryset, chunksize=10000)

        print "Calculating..."
        self.pbar_setup(maxval=queryset.count())
        pool = Pool()
        for text, work_results in pool.imap_unordered(work, queryset_iter):
            if work_results is not None:
                results.append(
                    Result(algorithm=algorithm_anew_arousal,
                           text=text,
                           value=work_results['arousal']))
                results.append(
                    Result(algorithm=algorithm_anew_valence,
                           text=text,
                           value=work_results['valence']))
                results.append(
                    Result(algorithm=algorithm_anew_dominance,
                           text=text,
                           value=work_results['dominance']))

                if len(results) > 10000:
                    if bulk_create_process and bulk_create_process.is_alive():
                        bulk_create_process.join()
                    bulk_create_process = Process(target=bulk_create,
                                                  kwargs={
                                                      'database': database,
                                                      'results': copy.copy(results)
                                                  })
                    bulk_create_process.start()
                    results = []
            self.pbar_increment()
        self.pbar_destroy()

        print "Saving last results..."
        Result.objects.using(database).bulk_create(results)
        results = []

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def handle(self, *args, **options):
    self.stdout.write('Command started')
    super(Command, self).handle(*args, **options)

    self.stdout.write("Connecting manually to target database")
    targetdb_conn = MySQLdb.connect(
        host=settings.GENERAL_DATA_DB_DETAILS['HOST'],
        user=settings.GENERAL_DATA_DB_DETAILS['USER'],
        passwd=settings.GENERAL_DATA_DB_DETAILS['PASSWORD'],
        db=settings.GENERAL_DATA_DB_DETAILS['NAME'],
        port=settings.GENERAL_DATA_DB_DETAILS['PORT'],
        charset='utf8',
        use_unicode=True
    )
    targetdb_cursor = targetdb_conn.cursor()

    select_query = """
        SELECT * FROM author_analysis WHERE newssite = %s
    """

    for database in self.selected_dbs:
        self.stdout.write("Processing database " + database)

        self.authors = {}
        queryset = Author.objects.using(database).all()
        if queryset.count() > 0:
            self.stdout.write("Reading authors")
            self.pbar_setup(maxval=queryset.count())
            queryset_iter = queryset_iterator(queryset, chunksize=1000)
            for author in queryset_iter:
                self.authors[author.name] = author
                self.pbar_increment()
            self.pbar_destroy()

        self.stdout.write("Moving data...")
        targetdb_cursor.execute(select_query, [database])

        self.pbar_setup(maxval=targetdb_cursor.rowcount)
        results = []
        for row in dictfetch(targetdb_cursor):
            if row['author'] in self.authors:
                le_author = self.authors[row['author']]
            else:
                le_author = Author.objects.using(database).create(
                    name=row['author'])
                self.authors[row['author']] = le_author

            results.append(AuthorSummary(
                author=le_author,
                avg_nr_words=row['avg_nr_words'],
                avg_wordlen=row['avg_wordlen'],
                avg_words_gt6=row['avg_words_gt6'],
                avg_personal=row['avg_personal'],
                avg_collective=row['avg_collective'],
                indegree=row['indegree'],
                indegree_centrality=row['indegree_centrality'],
                outdegree=row['outdegree'],
                outdegree_centrality=row['outdegree_centrality'],
                degree=row['degree'],
                degree_centrality=row['degree_centrality'],
                avg_shared=row['avg_shared'],
                pagerank=row['pagerank'],
                pagerank_weighted=row['pagerank_weighted'],
                nr_posts=row['nr_posts'],
                hub_score=row['hub_score'],
                authority_score=row['authority_score'],
                betweeness_centrality=row['betweeness_centrality'],
                closeness_centrality=row['closeness_centrality'],
                clustering_coef=row['clustering_coef'],
                eccentricity=row['eccentricity'],
                constraint=row['constraint'],
                polarity_arousal=row['polarity_arousal'],
                polarity_valence=row['polarity_valence'],
            ))

            if len(results) >= 1000:
                AuthorSummary.objects.using(database).bulk_create(results)
                results = []
            self.pbar_increment()
        self.pbar_destroy()

        if len(results) > 0:
            AuthorSummary.objects.using(database).bulk_create(results)

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
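# `dictfetch` wraps a raw DB-API cursor so rows can be read by column name
# (row['author'], row['nid'], ...) instead of by index. It is a project helper
# not shown in this file; below is a minimal sketch of the assumed
# implementation, based on the standard "fetch rows as dicts" recipe, written
# as a generator so large result sets stream row by row.
def dictfetch(cursor, arraysize=2000):
    columns = [col[0] for col in cursor.description]
    while True:
        rows = cursor.fetchmany(arraysize)
        if not rows:
            break
        for row in rows:
            yield dict(zip(columns, row))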
def handle(self, *args, **options):
    k = 5

    print "Reading data to memory"
    TFIDF_list = []
    label = []
    queryset = Newsitem.objects.all()
    self.pbar_setup(maxval=queryset.count())
    newsitems = queryset_iterator(queryset, chunksize=100)
    for newsitem in newsitems:
        TFIDF_list.append(utils.TFIDF(utils.tokenize(newsitem.text.text)))
        if newsitem.cat1 in [1, 2]:
            label.append(1)
        else:
            label.append(0)
        self.pbar_increment()
    self.pbar_destroy()

    print "Creating training and test data..."
    TFIDF_svm = []
    for i in TFIDF_list:
        TFIDF_svm.append(utils.TFIDF_to_list(i))
    # TFIDF_svm is the input matrix of SVM

    # Reads the train_len from command line
    #train_len=int(sys.argv[1])
    train_len = 200

    # Index of train samples from class 0
    indexZero = [i for i in range(len(label)) if label[i] == 0][:train_len]
    # Index of train samples from class 1
    indexOne = [i for i in range(len(label)) if label[i] == 1][:train_len]

    # We have K number of positive samples and also K number of negative samples
    train = []
    train_label = []
    for i in indexZero + indexOne:
        train.append(TFIDF_svm[i])
        train_label.append(label[i])

    # Train: train matrix
    # train_label: labels of train data
    # The other samples are test samples.
    test = [
        TFIDF_svm[i] for i in range(len(TFIDF_svm))
        if i not in indexZero + indexOne
    ]
    test_label = [
        label[i] for i in range(len(label))
        if i not in indexZero + indexOne
    ]

    print "Fitting..."
    clf = svm.SVC(probability=True)
    # Train the model
    clf.fit(train, train_label)
    #print "Score: " + clf.score(train, train_label)

    print "Generating probabilities"
    pred_probas = clf.predict_proba(test)[:, 1]
    fpr, tpr, _ = roc_curve(test_label, pred_probas)
    roc_auc = auc(fpr, tpr)

    print "Plotting..."
    plt.plot(fpr, tpr, label='area = %.2f' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.legend(loc='lower right')

    print "Saving!"
    plt.savefig('out.png')

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def handle(self, *args, **options):
    k = 5  # assumed neighbour count, matching the sibling kNN/SVM commands

    TFIDF_list = []
    label = []
    queryset = Newsitem.objects
    self.pbar_setup(maxval=queryset.count())
    newsitems = queryset_iterator(queryset.all(), chunksize=100)
    for newsitem in newsitems:
        TFIDF_list.append(self.tokenize(newsitem.text.text))
        if newsitem.cat1 in [1, 2]:
            label.append(1)
        else:
            label.append(0)
        self.pbar_increment()
    self.pbar_destroy()

    self.train()

    print "Estimating..."
    self.pbar_setup(maxval=len(TFIDF_list))
    counter1 = 0
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    while counter1 < len(TFIDF_list):
        distance_list = []
        counter2 = 0
        while counter2 < len(TFIDF_list):
            if counter1 != counter2:
                distance_list.append(
                    utils.TFIDF_distance(TFIDF_list[counter1],
                                         TFIDF_list[counter2]))
            counter2 += 1

        nearest_list = sorted(range(len(distance_list)),
                              key=lambda i: distance_list[i])[:k]

        repeat_dic = {}
        for i in nearest_list:
            if repeat_dic.has_key(label[i]):
                repeat_dic[label[i]] += 1
            else:
                repeat_dic[label[i]] = 1
        estimate_label = max(repeat_dic, key=repeat_dic.get)

        # predicting positive on a negative sample is a false positive,
        # and the reverse a false negative
        if estimate_label == 1 and label[counter1] == 1:
            TP += 1
        elif estimate_label == 1 and label[counter1] == 0:
            FP += 1
        elif estimate_label == 0 and label[counter1] == 0:
            TN += 1
        else:
            FN += 1

        counter1 += 1
        self.pbar_increment()
    self.pbar_destroy()

    data = [
        ('algo_knn_tp', TP),
        ('algo_knn_fn', FN),
        ('algo_knn_fp', FP),
        ('algo_knn_tn', TN),
    ]
    print 'TP=>', TP, 'FN=>', FN, 'FP=>', FP, 'TN=>', TN
    #print 'F1 Measurement: ', float(TP+TN)/(TP+FN+FP+TN), float(TP)/(TP+FP), float(TP)/(TP+FN), TP, FN, FP, TN

    print "Saving algorithm results"
    for item in data:
        algorithm_name = item[0]
        value = item[1]
        algorithm, create = Algorithm.objects.get_or_create(
            name=algorithm_name)
        result, created = Result.objects.get_or_create(algorithm=algorithm)
        result.value = str(value)
        result.save()

    algo_knn_uniform_estimative, create = Algorithm.objects.get_or_create(
        name="algo_knn_uniform_estimative")

    print "Calculating estimates and saving results"
    queryset = Newsitem.objects
    self.pbar_setup(maxval=queryset.count())
    newsitems = queryset_iterator(queryset.all(), chunksize=100)
    for newsitem in newsitems:
        data = utils.TFIDF(utils.tokenize(newsitem.text.text))
        distance_list = []
        for i in range(len(TFIDF_list)):
            distance_list.append(utils.TFIDF_distance(data, TFIDF_list[i]))

        nearest_list = sorted(range(len(distance_list)),
                              key=lambda i: distance_list[i])[:k]

        repeat_dic = {}
        for i in nearest_list:
            if distance_list[i] != 0:
                if repeat_dic.has_key(label[i]):
                    repeat_dic[label[i]] += 1
                else:
                    repeat_dic[label[i]] = 1
        estimate = max(repeat_dic, key=repeat_dic.get)

        Result.objects.create(algorithm=algo_knn_uniform_estimative,
                              text=newsitem.text,
                              value=str(estimate))
        self.pbar_increment()
    self.pbar_destroy()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def get_iterator(self):
    return itertools.chain.from_iterable([
        queryset_iterator(queryset, chunksize=self.chunksize)
        for queryset in self.querysets
    ])
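# A small usage sketch of the method above (object and model names here are
# hypothetical, for illustration only): the wrapper holds several querysets,
# possibly from different databases, and get_iterator exposes them as one flat,
# chunked stream.
#
#wrapper.querysets = [Comment.objects.using(db).all()
#                     for db in self.selected_dbs]
#wrapper.chunksize = 500
#for comment in wrapper.get_iterator():
#    process(comment)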
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    citations_re = re.compile("\".*\"")

    for database in self.selected_dbs:
        print "Processing database " + database
        cursor = connections[database].cursor()

        algorithm_list = ['afinn', 'TG_KNN3_TFIDF', 'TG_DT_TFIDF', 'TG_NB_TFIDF']
        queryset = Algorithm.objects.using(database).filter(
            name__in=algorithm_list)

        comments_score_query = """
            SELECT newsitem.id nid, comment.id, result.value
            FROM newsitem, comment, text, result
            WHERE comment.NewsItemID = newsitem.id
              AND comment.text_id = text.id
              AND text.id = result.text_id
              AND result.algorithm_id = %s
            ORDER BY nid, value
        """
        insert_summary_query = """
            INSERT INTO `newsitem_pos_neg_comments`
                (`newsitem_id`, `algorithm_id`, `pos_comments`,
                 `neg_comments`, `neutral_comments`)
            VALUES (%s,%s,%s,%s,%s)
        """
        insert_detail_query = """
            INSERT INTO `pos_neg_comments`
                (`algorithm_id`, `newsitem_id`, `comment_id`, `sentiment`)
            VALUES (%s,%s,%s,%s)
        """

        cursor = connections[database].cursor()
        for algorithm in queryset_iterator(queryset, chunksize=1000):
            print "Processing algorithm {}".format(algorithm.name)
            cursor.execute(comments_score_query, [algorithm.id])

            current_newsitem = 0
            type_hash = Counter({POSITIVE_COMMENT: 0,
                                 NEGATIVE_COMMENT: 0,
                                 NEUTRAL_COMMENT: 0})

            with transaction.atomic():
                self.pbar_setup(maxval=cursor.rowcount)
                insert_cursor = connections[database].cursor()
                detail_rows_to_insert = []
                summary_rows_to_insert = []

                for row in dictfetch(cursor):
                    if row['nid'] != current_newsitem:
                        if current_newsitem > 0:
                            summary_rows_to_insert.append([
                                current_newsitem,
                                algorithm.id,
                                type_hash[POSITIVE_COMMENT],
                                type_hash[NEGATIVE_COMMENT],
                                type_hash[NEUTRAL_COMMENT],
                            ])
                        current_newsitem = row['nid']
                        type_hash = Counter({POSITIVE_COMMENT: 0,
                                             NEGATIVE_COMMENT: 0,
                                             NEUTRAL_COMMENT: 0})

                    # count every row, including the first one of each newsitem
                    comment_type = self.check_result(algorithm, row['value'])
                    type_hash[comment_type] += 1
                    detail_rows_to_insert.append([
                        algorithm.id, row['nid'], row['id'], comment_type
                    ])

                    if len(detail_rows_to_insert) >= 10000:
                        insert_cursor.executemany(insert_detail_query,
                                                  detail_rows_to_insert)
                        detail_rows_to_insert = []

                    self.pbar_increment()

                if current_newsitem > 0:
                    summary_rows_to_insert.append([
                        current_newsitem,
                        algorithm.id,
                        type_hash[POSITIVE_COMMENT],
                        type_hash[NEGATIVE_COMMENT],
                        type_hash[NEUTRAL_COMMENT],
                    ])
                if detail_rows_to_insert:
                    insert_cursor.executemany(insert_detail_query,
                                              detail_rows_to_insert)
                insert_cursor.executemany(insert_summary_query,
                                          summary_rows_to_insert)
                self.pbar_destroy()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    k = 5

    print "Reading data to memory"
    TFIDF_list = []
    label = []
    queryset = Newsitem.objects.all()
    self.pbar_setup(maxval=queryset.count())
    newsitems = queryset_iterator(queryset, chunksize=100)
    for newsitem in newsitems:
        TFIDF_list.append(utils.TFIDF(utils.tokenize(newsitem.text.text)))
        if newsitem.cat1 in [1, 2]:
            label.append(1)
        else:
            label.append(0)
        self.pbar_increment()
    self.pbar_destroy()

    print "Training..."
    TFIDF_svm = []
    for i in TFIDF_list:
        TFIDF_svm.append(utils.TFIDF_to_list(i))
    # TFIDF_svm is the input matrix of SVM

    # Reads the train_len from command line
    #train_len=int(sys.argv[1])
    train_len = 200

    # Index of train samples from class 0
    indexZero = [i for i in range(len(label)) if label[i] == 0][:train_len]
    # Index of train samples from class 1
    indexOne = [i for i in range(len(label)) if label[i] == 1][:train_len]

    # We have K number of positive samples and also K number of negative samples
    train = []
    train_label = []
    for i in indexZero + indexOne:
        train.append(TFIDF_svm[i])
        train_label.append(label[i])

    # Train: train matrix
    # train_label: labels of train data
    # The other samples are test samples.
    test = [
        TFIDF_svm[i] for i in range(len(TFIDF_svm))
        if i not in indexZero + indexOne
    ]
    test_label = [
        label[i] for i in range(len(label))
        if i not in indexZero + indexOne
    ]

    clf = svm.SVC()
    # Train the model
    clf.fit(train, train_label)

    counter1 = 0
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    print "Estimating..."
    self.pbar_setup(maxval=len(test))
    for i in test:
        estimate_label = clf.predict([i])[0]
        # compare against the test-set labels; predicting positive on a
        # negative sample is a false positive, and the reverse a false negative
        if estimate_label == 1 and test_label[counter1] == 1:
            TP += 1
        elif estimate_label == 1 and test_label[counter1] == 0:
            FP += 1
        elif estimate_label == 0 and test_label[counter1] == 0:
            TN += 1
        else:
            FN += 1
        counter1 += 1
        self.pbar_increment()
    self.pbar_destroy()

    print 'TP=>', TP, 'FN=>', FN, 'FP=>', FP, 'TN=>', TN
    #print 'F1 Measurement: ', float(TP+TN)/(TP+FN+FP+TN), float(TP)/(TP+FP), float(TP)/(TP+FN), TP, FN, FP, TN

    data = [
        ('algo_svm_tp', TP),
        ('algo_svm_fn', FN),
        ('algo_svm_fp', FP),
        ('algo_svm_tn', TN),
        ('algo_svm_score', clf.score(train, train_label)),
    ]

    print "Saving algorithm results"
    for item in data:
        algorithm_name = item[0]
        value = item[1]
        algorithm, create = Algorithm.objects.get_or_create(
            name=algorithm_name)
        result, create = Result.objects.get_or_create(algorithm=algorithm)
        result.value = str(value)
        result.save()

    algo_svm_estimative, create = Algorithm.objects.get_or_create(
        name="algo_svm_estimative")

    print "Calculating estimates and saving results"
    queryset = Newsitem.objects
    self.pbar_setup(maxval=queryset.count())
    newsitems = queryset_iterator(queryset.all(), chunksize=100)
    for newsitem in newsitems:
        estimate = clf.predict([
            utils.TFIDF_to_list(
                utils.TFIDF(utils.tokenize(newsitem.text.text)))
        ])
        Result.objects.create(algorithm=algo_svm_estimative,
                              text=newsitem.text,
                              value=str(estimate[0]))
        self.pbar_increment()
    self.pbar_destroy()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    if len(self.selected_dbs) != 1:
        self.stdout.write(self.style.ERROR(
            'You need to select exactly one database for this command'))
        return
    selected_db = self.selected_dbs[0]

    nodes = Element('nodes')
    edges = Element('edges')

    self.stdout.write("Processing NewsItems")
    queryset = Newsitem.objects.using(selected_db).only('id').all()
    self.pbar_setup(maxval=queryset.count())
    for newsitem in queryset_iterator(queryset, chunksize=10000):
        nodes.append(Element('node', attrib={
            'id': "N{}".format(newsitem.id),
            'label': "Newsitem {}".format(newsitem.id),
        }))
        self.pbar_increment()
    self.pbar_destroy()

    self.stdout.write("Processing Comments")
    queryset = Comment.objects.using(selected_db)\
        .only('id', 'parent_id', 'newsitem_id')\
        .all()
    edge_id = 0
    self.pbar_setup(maxval=queryset.count())
    for comment in queryset_iterator(queryset, chunksize=10000):
        comment_id = "C{}".format(comment.id)
        nodes.append(Element('node', attrib={
            'id': comment_id,
            'label': "Comment {}".format(comment.id),
        }))

        # a reply points at its parent comment, a top-level comment at its newsitem
        if comment.parent_id:
            target_id = "C{}".format(comment.parent_id)
        else:
            target_id = "N{}".format(comment.newsitem_id)

        edges.append(Element('edge', attrib={
            'id': str(edge_id),
            'source': comment_id,
            'target': target_id,
            'type': 'directed'
        }))
        edge_id += 1
        self.pbar_increment()
    self.pbar_destroy()

    self.stdout.write("Finishing...")
    graph = Element('graph', attrib={
        'mode': 'dynamic',
        'defaultedgetype': 'directed',
    })
    graph.append(nodes)
    graph.append(edges)

    meta = Element('meta', attrib={
        'lastmodifieddate': datetime.date.today().isoformat()
    })
    creator = Element('creator')
    creator.text = "Iris Steenhout"
    meta.append(creator)
    description = Element('description')
    description.text = "Newsitems and their comments."
    meta.append(description)

    gexf = Element('gexf', attrib={
        'xmlns': 'http://www.gexf.net/1.2draft',
        'version': '1.2',
    })
    gexf.append(meta)
    gexf.append(graph)

    self.stdout.write("Writing...")
    options['output'].write(tostring(gexf, encoding="UTF-8"))
    options['output'].close()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    lem = WordNetLemmatizer()

    stopwords = {}
    with open('stopwords.txt', 'rU') as f:
        for line in f:
            stopwords[line.strip()] = 1

    for database in self.selected_dbs:
        print "Processing database " + database

        algorithm_tokenize, created = Algorithm.objects.using(
            database).get_or_create(name="vlad_tokenize")
        word_cache = WordCache()

        print "Excluding previous results"
        #Exclude all tokens and create them again
        Result.objects.using(database).filter(
            algorithm=algorithm_tokenize).delete()
        Word.objects.using(database).all().delete()

        print "Reading comments"
        non_tokenized_comments = Comment.objects.using(database).filter(
            text__isnull=False)
        total_comments = non_tokenized_comments.count()
        if total_comments == 0:
            print "No items to tokenize!"
            return

        comments = queryset_iterator(non_tokenized_comments.all(),
                                     chunksize=100)
        text_word = []

        print "Starting process!"
        self.pbar_setup(maxval=total_comments)
        for comment in comments:
            tokens = nltk.word_tokenize(self.clean_text(comment.text.text))
            text = [word for word in tokens if word not in stopwords]
            tagged_text = nltk.pos_tag(text)

            for word_text, tag in tagged_text:
                if tag in ['NN', 'NNS']:
                    word_id = word_cache.get(word_text)
                    if not word_id:
                        word_attributes = {
                            'word': word_text,
                            'tag': tag,
                            'noun': lem.lemmatize(word_text)
                        }
                        word_id = word_cache.save(word_attributes)
                    text_word.append((word_id, comment.text.id))

            self.pbar_increment()
        self.pbar_destroy()

        print "Bulk creating words"
        Word.objects.using(database).bulk_create(
            [Word(**attrib) for attrib in word_cache.get_as_list()])

        print "Opening cursor"
        cursor = connections[database].cursor()
        query = 'INSERT IGNORE INTO word_texts (word_id, text_id) VALUES (%s, %s)'

        print "Inserting rows for relation word-text..."
        self.pbar_setup(maxval=len(text_word))
        while len(text_word) > 0:
            chunk = []
            while len(chunk) < 10000:
                try:
                    chunk.append(text_word.pop())
                    self.pbar_increment()
                except IndexError:
                    break
            cursor.executemany(query, chunk)
            transaction.commit(database)
        self.pbar_destroy()

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
def handle(self, *args, **options):
    super(Command, self).handle(*args, **options)

    #Create temporary directory to write the corpus LDA-C files
    temp_dir_path = tempfile.mkdtemp()
    corpus_path = temp_dir_path + "/corpus.lda-c"
    lda_num_topics = 50

    for database in self.selected_dbs:
        print "Processing database " + database

        #Building dictionary
        print "Building dictionary"
        dictionary = Dictionary()
        queryset = Comment.objects.using(database).exclude(text__isnull=True)
        self.pbar_setup(maxval=queryset.count())
        for comment in queryset_iterator(queryset, chunksize=50):
            dictionary.add_documents(
                [[word.word for word in comment.text.word_set.all()]])
            self.pbar_increment()
        self.pbar_destroy()
        dictionary.filter_extremes(keep_n=10000)
        dictionary.compactify()

        #Serialize corpus
        print "Serializing corpus"
        corpus = Corpus(
            queryset_iterator(Comment.objects.using(database).all(),
                              chunksize=50), dictionary)
        BleiCorpus.serialize(corpus_path, corpus, id2word=dictionary)

        #Train
        print "Training..."
        bleicorpus = BleiCorpus(corpus_path)
        lda = gensim.models.LdaModel(bleicorpus,
                                     num_topics=lda_num_topics,
                                     id2word=dictionary)

        #Saving
        print "Saving results to DB"
        lda_db_obj, created = Algorithm.objects.using(
            database).get_or_create(name='LDA')

        #Removing previous results
        lda_db_obj.result_set.all().delete()

        #Looping through results and saving to DB
        i = 0
        for topic in lda.show_topics(num_topics=lda_num_topics):
            Result.objects.using(database).create(sequence=i,
                                                  value=str(topic),
                                                  algorithm=lda_db_obj)
            i += 1

    #Remove temporary directory
    #Check first if it's not the current working directory, as removing it
    # would be a disaster! ;)
    if os.getcwd() != temp_dir_path:
        #Just remove it if it's a temp dir
        shutil.rmtree(temp_dir_path)
    else:
        #If it's the current working directory, just remove the unneeded files
        map(os.remove, glob.glob('corpus.lda-c*'))

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))
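# `Corpus` above is a small streaming adapter between the Django queryset and
# gensim's BleiCorpus.serialize; it is defined elsewhere in the project. Below
# is a minimal sketch of the assumed shape (the name and the word_set-based
# tokenization are assumptions inferred from how it is constructed above):
# iterating it yields one bag-of-words vector per comment without loading the
# whole collection into memory.
class Corpus(object):
    def __init__(self, comments, dictionary):
        self.comments = comments
        self.dictionary = dictionary

    def __iter__(self):
        for comment in self.comments:
            words = [word.word for word in comment.text.word_set.all()]
            yield self.dictionary.doc2bow(words)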
def handle(self, *args, **options):
    self.stdout.write('Command started')
    super(Command, self).handle(*args, **options)

    #self.edges = []
    #
    #self.stdout.write("Connecting manually to target database")
    #targetdb_conn = MySQLdb.connect(
    #    host = settings.GENERAL_DATA_DB_DETAILS['HOST'],
    #    user = settings.GENERAL_DATA_DB_DETAILS['USER'],
    #    passwd= settings.GENERAL_DATA_DB_DETAILS['PASSWORD'],
    #    db = settings.GENERAL_DATA_DB_DETAILS['NAME'],
    #    port = settings.GENERAL_DATA_DB_DETAILS['PORT'],
    #    charset = 'utf8',
    #    use_unicode = True
    #)
    #targetdb_cursor = targetdb_conn.cursor()
    #
    #select_query = """
    #    SELECT * FROM author_analysis WHERE newssite = %s
    #"""

    for database in self.selected_dbs:
        self.stdout.write("Processing database " + database)

        self.authors = {}
        queryset = Author.objects.using(database).all()
        if queryset.count() > 0:
            self.stdout.write("Reading authors")
            self.pbar_setup(maxval=queryset.count())
            queryset_iter = queryset_iterator(queryset, chunksize=1000)
            for author in queryset_iter:
                self.authors[author.name] = author
                self.pbar_increment()
            self.pbar_destroy()

        for model in [Comment, Newsitem]:
            if model == Comment:
                model_name = "Comment"
                filters = {
                    'authorshortname__isnull': False,
                    'author__isnull': True
                }
            else:
                model_name = "Newsitem"
                filters = {
                    'idauthor__isnull': False,
                    'author__isnull': True
                }
            self.stdout.write("Processing model " + model_name)

            queryset = model.objects.using(database)\
                .filter(**filters)

            self.stdout.write('Linking Authors with ' + model_name)
            if queryset.count() == 0:
                self.stdout.write('No authors to link')
                continue

            self.pbar_setup(maxval=queryset.count())
            queryset_iter = queryset_iterator(queryset, chunksize=1000)
            for item in queryset_iter:
                if model == Comment:
                    if item.authorshortname in self.authors:
                        item.author = self.authors[item.authorshortname]
                    else:
                        le_author = Author.objects.using(database).create(
                            name=item.authorshortname,
                            long_name=item.authorid)
                        item.author = le_author
                        self.authors[item.authorshortname] = le_author
                else:
                    if item.idauthor in self.authors:
                        item.author = self.authors[item.idauthor]
                    else:
                        le_author = Author.objects.using(database).create(
                            name=item.idauthor)
                        item.author = le_author
                        self.authors[item.idauthor] = le_author
                item.save()
                self.pbar_increment()
            self.pbar_destroy()

            bulk_update(queryset, batch_size=1000, using=database,
                        update_fields=['author'])

    self.stdout.write(self.style.SUCCESS('Command executed successfully'))