def ComputeVocabulary():
    try:
        cursor.execute("select commentBody from vocab_comments")
        n = 0
        for row in cursor:
            n = n + 1
            if n % 100 == 0:
                print n
            # tokenize, drop stopwords, and stem each comment body
            ct = CleanAndTokenize(row[0])
            ct = [w for w in ct if w not in stopword_list]
            stemmed_tokens = [porter.stem(t) for t in ct]
            # accumulate document frequency for each stemmed token
            for t in stemmed_tokens:
                if t not in doc_frequency:
                    doc_frequency[t] = 1
                else:
                    doc_frequency[t] = doc_frequency[t] + 1

        # sort terms by document frequency, most frequent first
        sorted_list = sorted(doc_frequency.items(),
                             key=operator.itemgetter(1), reverse=True)

        # find cutoff: keep only terms that appear in at least 10 documents
        unigram_cutoff = 0
        json_data = {}
        out_file = open("apidata/vocab_freq.json", "w")
        for (i, (word, word_freq)) in enumerate(sorted_list):
            if word_freq < 10:
                unigram_cutoff = i - 1
                break
            json_data[word] = word_freq
        json.dump(json_data, out_file)
        out_file.close()
        print "unigram cutoff: " + str(unigram_cutoff)
    except Exception:
        print error_name(g_day, g_offset)
        sys.exit(1)
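
# --- Illustrative helper (not part of the original module) ------------------
# A minimal sketch of how the persisted vocabulary could be loaded back into
# the module-level globals used by the relevance functions below. vocab_freq
# and nDocuments are real names in this module, but the code that actually
# populates them is not shown in this file, so treat this as an assumption.
# Assumes json is imported at module level (it is used above).
def LoadVocabularySketch(path="apidata/vocab_freq.json", corpus_size=None):
    with open(path) as vocab_file:
        freq = json.load(vocab_file)  # stemmed term -> document frequency
    # corpus_size should be the number of comments ComputeVocabulary() iterated
    # over; the caller must supply it because this file does not persist it.
    return freq, corpus_size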
def calcPersonalXPScores(comment_text):
    personal_xp_score = 0

    # strip punctuation (keeping $, %, and ') from a lowercased copy
    # note: CleanAndTokenize below still runs on the raw comment_text, so this
    # stripped copy does not affect the tokens that actually get scored
    text = comment_text.lower()
    punctuations = string.punctuation  # !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~
    excluded_punctuations = ["$", "%", "'"]
    for p in punctuations:
        if p not in excluded_punctuations:
            text = text.replace(p, " ")

    # tokenize the comment
    token_list = CleanAndTokenize(comment_text)
    text_tokens = token_list

    # count tokens whose stem appears in the personal_words list
    for tok in text_tokens:
        tok_stem = porter.stem(tok)
        if tok_stem in personal_words:
            personal_xp_score = personal_xp_score + 1

    # normalize by the number of tokens
    if len(text_tokens) > 0:
        personal_xp_score = float(personal_xp_score) / float(len(text_tokens))
    else:
        personal_xp_score = 0.0

    return round(personal_xp_score, 3)
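
# --- Illustrative usage (not part of the original module) -------------------
# Assuming the module globals (porter, personal_words, CleanAndTokenize) are
# initialized, the score is the fraction of tokens whose stem appears in
# personal_words, rounded to three decimals. The sample sentence and the value
# shown are hypothetical, not taken from the real word list.
#
# score = calcPersonalXPScores("I remember when my family visited the museum")
# print score  # e.g. 0.25 if two of the eight stems are personal-experience words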
def calLength(comment_text):
    # comment length measured in tokens
    token = CleanAndTokenize(comment_text)
    return len(token)
def ComputeCommentArticleRelevance(comment_text, ID, operation):
    cnx = mysql.connector.connect(user=user, password=password,
                                  host=host, database=database)
    cursor = cnx.cursor()

    if operation == 'add':
        articleID = ID
        cursor.execute("select full_text from articles where articleID = '" + str(articleID) + "'")
        article_data = cursor.fetchall()
    elif operation == 'update':
        commentID = ID
        cursor.execute("select articleID from comments where commentID ='" + str(commentID) + "' ")
        fetch_data = cursor.fetchall()
        if len(fetch_data) > 0:
            articleID = fetch_data[0][0]
        else:
            ArticleRelevance = 0.0
            return ArticleRelevance
        cursor.execute("select full_text from articles where articleID = '" + str(articleID) + "'")
        article_data = cursor.fetchall()
    else:
        ArticleRelevance = 0.0
        return ArticleRelevance
    cnx.close()

    if len(article_data) < 1:
        ArticleRelevance = 0.0
        return ArticleRelevance

    # the articles query returns at most one row; grab its full text
    for data in article_data:
        article_text = data[0]

    comment_text = escape_string(comment_text.strip())

    # clean and tokenize the comment text and article text, excluding stopwords
    token_list = CleanAndTokenize(comment_text)
    token_list = [word for word in token_list if word not in stopword_list]
    comment_stemmed_tokens = [porter.stem(token) for token in token_list]
    comment_stemmed_tokens_fd = FreqDist(comment_stemmed_tokens)

    token_list = CleanAndTokenize(article_text)
    token_list = [word for word in token_list if word not in stopword_list]
    article_stemmed_tokens = [porter.stem(token) for token in token_list]
    article_stemmed_tokens_fd = FreqDist(article_stemmed_tokens)

    # now create the feature vectors for article and comment
    article_features = {}
    comment_features = {}

    # calculate a tf-idf weight for each vocabulary word
    for w in vocab_freq:
        df = vocab_freq[w]
        # integer division under Python 2; if it truncates below 1, recompute
        # with Decimal so math.log() receives the true positive ratio
        log_fraction = (nDocuments / df)
        if log_fraction < 1:
            log_fraction = Decimal(nDocuments) / Decimal(df)
        if w in article_stemmed_tokens:
            article_features[w] = article_stemmed_tokens_fd[w] * math.log(log_fraction)
        else:
            article_features[w] = 0.0
        if w in comment_stemmed_tokens:
            comment_features[w] = comment_stemmed_tokens_fd[w] * math.log(log_fraction)
        else:
            comment_features[w] = 0.0

    # normalize vectors and take the cosine similarity
    article_features = NormalizeVector(article_features)
    comment_features = NormalizeVector(comment_features)
    comment_article_similarity = ComputeCosineSimilarity(article_features,
                                                         comment_features)
    return comment_article_similarity
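
# --- Illustrative helpers (not part of the original module) -----------------
# NormalizeVector and ComputeCosineSimilarity are called above and in the next
# function but defined elsewhere in this module. The sketches below assume the
# conventional behaviour (L2 normalization of a {term: weight} dict and a
# dot-product cosine) and may differ from the real implementations. Assumes
# math is imported at module level (it is used above).
def NormalizeVectorSketch(features):
    norm = math.sqrt(sum(v * v for v in features.values()))
    if norm == 0:
        return features
    return {k: v / norm for k, v in features.items()}

def ComputeCosineSimilaritySketch(vec_a, vec_b):
    # both vectors are keyed on the same vocabulary, so a dot product suffices
    return sum(vec_a[k] * vec_b.get(k, 0.0) for k in vec_a)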
def ComputeCommentConversationalRelevance(comment_text, ID, operation):
    cnx = mysql.connector.connect(user=user, password=password,
                                  host=host, database=database)
    cursor = cnx.cursor()

    if operation == 'add':
        articleID = ID
        cursor.execute("select commentBody from comments where articleID = '" + str(articleID) + "' ")
        comment_data = cursor.fetchall()
    elif operation == 'update':
        commentID = ID
        cursor.execute("select articleID from comments where commentID ='" + str(commentID) + "' ")
        fetch_data = cursor.fetchall()
        if len(fetch_data) > 0:
            articleID = fetch_data[0][0]
        else:
            ConversationalRelevance = 0.0
            return ConversationalRelevance
        cursor.execute("select commentBody from comments "
                       "where articleID = '" + str(articleID) + "' and commentID < '" + str(commentID) + "' ")
        comment_data = cursor.fetchall()
    else:
        ConversationalRelevance = 0.0
        return ConversationalRelevance
    cnx.close()

    if len(comment_data) < 2:
        ConversationalRelevance = 0.0
        return ConversationalRelevance

    centroid_comment_stemmed_tokens = []
    centroid_comment_features = {}

    # clean and tokenize all of the thread's comments, exclude stopwords, and
    # pool the stems into a single centroid "document"
    comment_list = [row[0] for row in comment_data]
    for comment in comment_list:
        token_list = CleanAndTokenize(comment)
        token_list = [word for word in token_list if word not in stopword_list]
        centroid_comment_stemmed_tokens.extend(
            [porter.stem(token) for token in token_list])
    centroid_comment_stemmed_tokens_fd = FreqDist(centroid_comment_stemmed_tokens)

    # tf-idf weight for each vocabulary word in the centroid
    for w in vocab_freq:
        log_fraction = (nDocuments / vocab_freq[w])
        if log_fraction < 1:
            log_fraction = Decimal(nDocuments) / Decimal(vocab_freq[w])
        if w in centroid_comment_stemmed_tokens:
            centroid_comment_features[w] = centroid_comment_stemmed_tokens_fd[w] * math.log(log_fraction)
        else:
            centroid_comment_features[w] = 0.0

    # normalize vector
    centroid_comment_features = NormalizeVector(centroid_comment_features)

    # now compute the distance of the new comment to the centroid
    comment_stemmed_tokens = []
    comment_features = {}
    comment_text = escape_string(comment_text.strip())
    token_list = CleanAndTokenize(comment_text)
    token_list = [word for word in token_list if word not in stopword_list]
    comment_stemmed_tokens.extend([porter.stem(token) for token in token_list])
    comment_stemmed_tokens_fd = FreqDist(comment_stemmed_tokens)

    # tf-idf weight for each vocabulary word in the comment
    for w in vocab_freq:
        log_fraction = (nDocuments / vocab_freq[w])
        if log_fraction < 1:
            log_fraction = Decimal(nDocuments) / Decimal(vocab_freq[w])
        if w in comment_stemmed_tokens:
            comment_features[w] = comment_stemmed_tokens_fd[w] * math.log(log_fraction)
        else:
            comment_features[w] = 0.0
    comment_features = NormalizeVector(comment_features)

    comment_originality = ComputeCosineSimilarity(centroid_comment_features,
                                                  comment_features)
    return comment_originality
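
# --- Illustrative usage (not part of the original module) -------------------
# Both relevance scores are cosine similarities against the shared tf-idf
# vocabulary (in [0, 1], since tf-idf weights are nonnegative). The IDs below
# are hypothetical: with 'add' the ID is an articleID and the comment is scored
# against that article/thread, with 'update' the ID is an existing commentID
# and the articleID is looked up first.
#
# article_rel = ComputeCommentArticleRelevance(comment_text, 12345, 'add')
# thread_rel = ComputeCommentConversationalRelevance(comment_text, 67890, 'update')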