def filterDocuments():
    """Rewrite data.txt as filteredData.txt, dropping the rare words pickled by
    getWordCounts (assumes a module-level `path` and pickle's Unpickler)."""
    docFile = open(path + "/data.txt")
    toWrite = open(path + "/filteredData.txt", 'w')
    lessThanFive = Unpickler(open(path + "/lessFrequentWords.txt")).load()
    i = 0
    for line in docFile:
        if i % 10000 == 0:
            print i
        i += 1
        line = line.decode("utf-8").strip()
        if line:
            cat, doc = line.split("\t")
            words = Utility.getWords(doc)
            filtered = [w.encode("utf-8") for w in words if w not in lessThanFive]
            toWrite.write(cat.encode("utf-8") + "\t" + " ".join(filtered))
            toWrite.write("\n")
    docFile.close()
    toWrite.close()
def getWordCounts():
    """Count how often each word appears in data.txt and pickle the set of words
    that occur five times or fewer into lessFrequentWords.txt."""
    docFile = open(path + "/data.txt")
    wCounts = {}
    print "computing word frequencies"
    i = 0
    for line in docFile:
        if i % 10000 == 0:
            print i
        i += 1
        line = line.decode("utf-8").strip()
        if line:
            cat, doc = line.split("\t")
            words = Utility.getWords(doc)
            for w in words:
                wCounts[w] = 1 + wCounts.get(w, 0)
    docFile.close()
    lessThanFive = set()
    for w, c in wCounts.items():
        if c <= 5:
            lessThanFive.add(w)
    toWrite = open(path + "/lessFrequentWords.txt", 'w')
    Pickler(toWrite).dump(lessThanFive)
    toWrite.close()
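# A minimal usage sketch (the run order is an assumption inferred from the files the
# two functions share): getWordCounts must run first so that lessFrequentWords.txt
# exists before filterDocuments tries to load it.
if __name__ == '__main__':
    getWordCounts()      # writes lessFrequentWords.txt (words appearing five times or fewer)
    filterDocuments()    # writes filteredData.txt with those rare words stripped out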
def testNaiveBayes():
    begin = time.time()
    counter = 0

    fileToRead = open(naivebayes_classification.str_dict_word_in_cat)
    dict_words = Unpickler(fileToRead).load()
    fileToRead.close()

    fileToRead = open(naivebayes_classification.str_dict_cat_count)
    dict_cat_count = Unpickler(fileToRead).load()
    fileToRead.close()

    fileToRead = open(naivebayes_classification.str_dict_priors)
    dict_priors = Unpickler(fileToRead).load()
    fileToRead.close()

    numErrors = 0
    for line in open(Utility.path_dataset):
        parts = line.decode('utf-8').strip().split('\t')
        category = parts[0]
        text = parts[1]
        words = Utility.getWords(text)
        nb_category = get_NB_category(words, dict_words, dict_cat_count, dict_priors)
        if nb_category != category:
            #print 'correct category: %s my category: %s' % (category, nb_category)
            numErrors += 1
        counter += 1
        if counter % 5000 == 0:
            print 'counter: %d\n' % counter
            print 'time: %d\n' % (time.time() - begin)
            print 'accuracy: %f' % (100 * (1 - numErrors * 1.0 / counter))
def buildDicts(file_name, save_dicts=False):
    """
    :param file_name: the file that contains the pairs (category, sentence) that we build the dictionaries from
    :param save_dicts: whether to save the dictionaries to files or just return them
    :return: dict_cat_count, dict_word_in_cat, priors
    """
    counter = 0
    # priors (priors[cat] <- how many documents are in category cat)
    priors = {}
    # for a given category as a key, this returns how many words there are as a value
    dict_cat_count = {}
    # for a given category and a word as keys (dict_word_in_cat[category][word]),
    # we get how many times that word has appeared in that category
    dict_word_in_cat = {}
    for line in open(file_name):
        parts = line.decode('utf-8').strip().split('\t')
        category = parts[0]
        text = parts[1]
        priors[category] = 1 + priors.get(category, 0)
        words = Utility.getWords(text)
        dict_cat_count[category] = len(words) + dict_cat_count.get(category, 0)
        dict_word_in_cat.setdefault(category, {})
        for w in words:
            dict_word_in_cat[category][w] = 1 + dict_word_in_cat[category].get(w, 0)
        counter += 1
        if counter % 10000 == 0:
            print 'processed %d news' % counter
    # turn the document counts into probabilities
    for cat in priors:
        priors[cat] = (priors[cat] * 1.0) / counter
        print 'Category %s num words: %d' % (cat, dict_cat_count[cat])
    unique_words = set()
    for cat in dict_word_in_cat:
        unique_words.update(dict_word_in_cat[cat])
    print 'num unique words: %d' % len(unique_words)
    if save_dicts:
        print 'Saving the dicts..'
        fileToWrite = open(str_dict_cat_count, 'w')
        Pickler(fileToWrite).dump(dict_cat_count)
        fileToWrite.close()
        fileToWrite = open(str_dict_word_in_cat, 'w')
        Pickler(fileToWrite).dump(dict_word_in_cat)
        fileToWrite.close()
        fileToWrite = open(str_dict_priors, 'w')
        Pickler(fileToWrite).dump(priors)
        fileToWrite.close()
        print 'Saved the dicts!'
    return dict_cat_count, dict_word_in_cat, priors
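# get_NB_category is called by testNaiveBayes above but is not shown in this file.
# Below is a minimal sketch of what it could look like, assuming the dictionaries
# produced by buildDicts: dict_priors[cat] is the category prior, dict_cat_count[cat]
# is the total number of words seen in the category, and dict_words[cat][w] is how
# many times word w appeared in the category. The add-one (Laplace) smoothing and
# the vocabulary-size estimate are assumptions, not necessarily the original choices.
import math

def get_NB_category_sketch(words, dict_words, dict_cat_count, dict_priors):
    vocab_size = len(set(w for cat in dict_words for w in dict_words[cat]))
    best_cat, best_score = None, float('-inf')
    for cat in dict_priors:
        # log prior + sum of smoothed log likelihoods for every word in the document
        score = math.log(dict_priors[cat])
        denom = dict_cat_count[cat] + vocab_size
        for w in words:
            score += math.log((dict_words[cat].get(w, 0) + 1.0) / denom)
        if score > best_score:
            best_cat, best_score = cat, score
    return best_cat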
def getNewsPosts(source_object, web_page_url, dict_IDF):
    """
    The main function which crawls a particular link and returns the news posts (as objects)
    that have been extracted from that link. Needs revising and (possibly) modifying the
    process of text extraction.
    :param source_object: the source object that wraps multiple web_page_urls (we need it for creating the news post object)
    :param web_page_url: the web page url where we extract the information from
    :param dict_IDF: the idf dictionary that we need to calculate tf_idf for a document
    :return: list of news posts and a feedback (for logging)
    """
    # feedback variable for logging
    feedback = ''
    try:
        # opening the url and reading the content
        c = urlopen(web_page_url)
        content = c.read()
        soup = BeautifulSoup(content)
        logging.debug('getNewsPosts: read web_page_url')
        logging.debug('instantiated beautiful soup')
        # the list of objects that we are going to return
        newsPosts = []
        for item in soup.findAll('item'):
            # in each item we have a link to the news that we would like to process
            try:
                # title of the news
                title = item.find('title').string
                # link to the news
                link_url = item.find('link').string
                feedback += 'title: %s\n' % title
                feedback += 'link_url: %s\n' % link_url
                pub_date = item.find('pubdate')
                if pub_date is not None:
                    pub_date = pub_date.string
                    datetime_obj = parse(pub_date, ignoretz=True)
                    feedback += 'pub_date: %s\n' % (datetime_obj.strftime('%B %d %Y %H:%M'))
                    date_milli = (datetime_obj - epoch).total_seconds() * 1000.0
                    pub_date = date_milli
                    feedback += 'milli: %f\n' % date_milli
                else:
                    pub_date = 0
                    feedback += 'pub_date: None\n'
                same_news_posts = NewsPost.query(NewsPost.url == link_url).fetch()
                # we must not process the same news twice
                if same_news_posts is not None and len(same_news_posts) > 0:
                    feedback += 'There is/are already news post/s with this link. Continuing..\n'
                    feedback += '------------------------------\n'
                    continue
                img_url = None
                # we try to fetch the photo url directly from the rss feed;
                # if not possible we will try again later
                if (item.description is not None) and (item.description.string is not None):
                    img_obj = BeautifulSoup(item.description.string).find('img')
                    if img_obj is not None:
                        img_url = img_obj['src']
                elif item.description is not None:
                    img_obj = item.description.find('img')
                    if img_obj is not None:
                        img_url = img_obj['src']
                # here we get the content of the news
                link_content = urlopen(link_url).read()
                innerSoup = BeautifulSoup(link_content)
                title_words = Utility.getWords(title)
                title_words = filterTitles(title_words, web_page_url)
                total_words = title_words
                # add the title twice, because we consider the words in the title
                # twice as important as the other words
                total_words.extend(total_words)
                # which paragraphs to take into consideration
                text = ''
                for script in innerSoup(['script', 'style']):
                    script.extract()
                if web_page_url in Utility.fetch_text_specifications:
                    specifications = Utility.fetch_text_specifications[web_page_url]
                    if isinstance(specifications, list):
                        # we take the paragraphs
                        start = specifications[0]
                        end = len(innerSoup.findAll('p'))
                        if len(specifications) > 1:
                            end = specifications[1]
                        for p in innerSoup.findAll('p')[start:end]:
                            text += '%s ' % p.text
                    else:
                        tag_type = specifications['tag_type']
                        attr_type = specifications['attribute_type']
                        attr_value = specifications['attribute_value']
                        sections = innerSoup.findAll(tag_type, {attr_type: attr_value})
                        if 'nested_tag_type' in specifications:
                            # we need to go one level deeper
                            nested_tag_type = specifications['nested_tag_type']
                            nested_attr_type = specifications['nested_attribute_type']
                            nested_attr_value = specifications['nested_attribute_value']
                            limit = specifications.get('limit', 1000)
                            new_sections = []
                            for section in sections:
                                new_sections.extend(section.findAll(nested_tag_type,
                                                                    {nested_attr_type: nested_attr_value},
                                                                    limit=limit))
                            sections = new_sections
                        for section in sections:
                            text += '%s ' % section.text
                description = text[:min(100, len(text))]
                total_words.extend(Utility.getWords(text))
                num_words = len(total_words)
                if num_words < 7:
                    continue
                dict_news = {}
                for word in total_words:
                    dict_news[word] = 1 + dict_news.get(word, 0)
                # we are trying to get the image from the news
                if img_url is None:
                    imgs = innerSoup.findAll('img')
                    img_url = ''
                    if imgs is not None and len(imgs) > 0:
                        img_url = imgs[0]['src']
                # deal with pictures whose path is relative to the web site
                if (img_url is not None) and (len(img_url) > 0):
                    if img_url.find(source_object.url) != 0:
                        img_url = source_object.url + '/' + img_url
                feedback += 'img_url: %s\n' % img_url
                newsPost = NewsPost(parent=ndb.Key('NewsPost', link_url or "*notitle*"),
                                    url=link_url,
                                    host_page=web_page_url,
                                    title=title,
                                    dictWords=dict_news,
                                    numWords=num_words,
                                    words=total_words,
                                    source_id=source_object.id,
                                    source_url=source_object.url,
                                    img_url=img_url,
                                    pub_date=pub_date,
                                    description=description)
                newsPost.calculate_tf_idf(dict_IDF)
                newsPost.put()
                newsPosts.append(newsPost)
                feedback += '------------------------------\n'
            except Exception as inst:
                feedback += 'Inner Exception type: %s\n' % str(type(inst))
                feedback += 'Inner Exception message: %s\n' % inst.message
        return newsPosts, feedback
    except Exception as inst:
        feedback += 'Exception type: %s\n' % type(inst)
        feedback += 'Exception message: %s\n' % inst.message
        # if there is an exception, we return an empty list of news posts
        return [], feedback
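# Utility.fetch_text_specifications is consumed above but defined elsewhere. A
# hypothetical entry, inferred from how getNewsPosts reads it: the value per feed
# URL is either a [start, end] slice over the article's <p> tags, or a dict that
# names the container tag (optionally with a nested tag one level deeper). The
# URLs and attribute values below are illustrative assumptions only.
fetch_text_specifications_example = {
    'http://example.com/rss': [1, 10],          # take paragraphs 1..9 of the article page
    'http://example.org/rss': {
        'tag_type': 'div',
        'attribute_type': 'class',
        'attribute_value': 'article-body',
        'nested_tag_type': 'p',                 # go one level deeper into the container
        'nested_attribute_type': 'class',
        'nested_attribute_value': 'text',
        'limit': 20,
    },
}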
def parse_rss_feed(rss_feed_url, stop_after=None):
    feedback = ''
    #model = get_persisted_model()
    _, ind_to_cat = Utility.load_categories()
    #vectorizer = get_persisted_vectorizer()
    #feedback += 'In parse rss feed!!!\n'
    try:
        # opening the url and reading the content
        #feedback += 'trying to open the content %s\n' % rss_feed_url
        c = urlopen(rss_feed_url)
        #feedback += 'Opened the content\n'
        content = c.read()
        #feedback += 'Read the content'
        soup = BeautifulSoup(content)
        #feedback += 'Opened the content\n'
        ind = 0
        tuples = []
        for item in soup.findAll('item'):
            ind += 1
            #feedback += 'Item num: %d\n' % ind
            if (stop_after is not None) and ind > stop_after:
                break
            try:
                title = item.find('title').string
                link_url = item.find('link').string
                description_object = item.find('description')
                description = ''
                for p in BeautifulSoup(description_object.string).findAll('p'):
                    description += '%s ' % p.text
                feedback += 'title: %s\n' % title.strip()
                feedback += 'link_url: %s\n' % link_url
                feedback += 'description: %s\n' % description
                pub_date = item.find('pubdate')
                if pub_date is not None:
                    pub_date = pub_date.string
                    datetime_obj = parse(pub_date, ignoretz=True)
                    feedback += 'pub_date: %s\n' % (datetime_obj.strftime('%B %d %Y %H:%M'))
                    date_milli = (datetime_obj - epoch).total_seconds() * 1000.0
                    feedback += 'milli: %f\n' % date_milli
                else:
                    feedback += 'pub_date: None\n'
                text = ''
                img_url = None
                # we try to fetch the photo url directly from the rss feed;
                # if not possible we will try again later
                if (item.description is not None) and (item.description.string is not None):
                    img_obj = BeautifulSoup(item.description.string).find('img')
                    if img_obj is not None:
                        img_url = img_obj['src']
                elif item.description is not None:
                    img_obj = item.description.find('img')
                    if img_obj is not None:
                        img_url = img_obj['src']
                # here we get the content of the news
                link_content = urlopen(link_url).read()
                innerSoup = BeautifulSoup(link_content)
                for script in innerSoup(['script', 'style']):
                    script.extract()
                if rss_feed_url in Utility.fetch_text_specifications:
                    specifications = Utility.fetch_text_specifications[rss_feed_url]
                    if isinstance(specifications, list):
                        # we take the paragraphs
                        start = specifications[0]
                        end = len(innerSoup.findAll('p'))
                        if len(specifications) > 1:
                            end = specifications[1]
                        for p in innerSoup.findAll('p')[start:end]:
                            text += p.text
                    else:
                        tag_type = specifications['tag_type']
                        attr_type = specifications['attribute_type']
                        attr_value = specifications['attribute_value']
                        #feedback += 'tag_type: %s attr_type: %s attr_value: %s\n' % (tag_type, attr_type, attr_value)
                        sections = innerSoup.findAll(tag_type, {attr_type: attr_value})
                        #feedback += 'tags size: %d\n' % len(sections)
                        # if we need to go deeper
                        if 'nested_tag_type' in specifications:
                            nested_tag_type = specifications['nested_tag_type']
                            nested_attr_type = specifications['nested_attribute_type']
                            nested_attr_value = specifications['nested_attribute_value']
                            limit = specifications.get('limit', 1000)
                            recursive = specifications.get('recursive', True)
                            new_sections = []
                            for section in sections:
                                new_sections.extend(section.findAll(nested_tag_type,
                                                                    {nested_attr_type: nested_attr_value},
                                                                    limit=limit, recursive=recursive))
                            sections = new_sections
                        for section in sections:
                            feedback += 'tag name: %s\n' % section.name
                            text += section.text
                # we are trying to get the image from the news
                if img_url is None:
                    imgs = innerSoup.findAll('img')
                    img_url = ''
                    if imgs is not None and len(imgs) > 0:
                        img_url = imgs[0]['src']
                feedback += 'img_url: %s\n' % img_url
                feedback += 'text: %s\n' % text.strip()
                #X_train = vectorizer.transform([text])
                #cat_ind = model.predict(X_train)
                #feedback += 'CATEGORY: %s' % ind_to_cat[cat_ind[0]]
                feedback += '------------------------------\n'
                tuples.append((title, link_url, Utility.getWords(text)))
            except Exception as inst:
                feedback += 'Inner Exception type: %s\n' % str(type(inst))
                feedback += 'Inner Exception message: %s\n' % inst.message
        feedback += 'Number of posts: %d\n' % ind
        # go from the front: find the first word position where the documents differ
        front = -1
        for ind in xrange(100):
            everywhere = True
            for i in xrange(1, len(tuples)):
                if not tuples[i - 1][2][ind] == tuples[i][2][ind]:
                    everywhere = False
                    break
            if not everywhere:
                front = 1 + ind
                break
        return feedback
    except Exception as inst:
        feedback += 'Exception type: %s\n' % type(inst)
        feedback += 'Exception message: %s\n' % inst.message
        # if there is an exception, we just return the feedback collected so far
        return feedback
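# The "go from the front" loop above appears to estimate how many leading words are
# identical across all fetched articles (shared site boilerplate such as a navigation
# menu). A standalone sketch of that idea, assuming word_lists is a list of token
# lists, one per article; the helper name and the bounds handling are assumptions.
def common_prefix_length_sketch(word_lists):
    if len(word_lists) < 2:
        return 0
    shortest = min(len(words) for words in word_lists)
    for ind in xrange(shortest):
        first = word_lists[0][ind]
        # stop at the first position where any document disagrees with the first one
        if any(words[ind] != first for words in word_lists[1:]):
            return ind
    return shortest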