def classifiy_wiki_article(search_str, tfidfs_per_doc, idfs, lem_flag=False):
    '''
    Fetch a Wikipedia article by title and classify it against the
    provided data.

    The article URL is formed by appending search_str (spaces replaced by
    underscores) to the English Wikipedia base URL. The text is retrieved
    with get_wiki.get_specific_wikipedia_article (markup=False), reduced to
    terms by str_corpus_cleaner.get_clean_terms, and handed to
    classify_article_words.

    lem_flag is forwarded to get_clean_terms (presumably toggles
    lemmatization -- confirm against that helper).

    -Return: whatever classify_article_words returns for the article terms.

    TODO: not working...
    '''
    page_title = search_str.replace(' ', '_')
    wiki_url = 'http://en.wikipedia.org/wiki/' + page_title

    ar_text = get_wiki.get_specific_wikipedia_article(wiki_url, markup=False)
    # Debug output: dumps the fetched article text to stdout.
    print(ar_text)

    cleaned_terms = str_corpus_cleaner.get_clean_terms(ar_text, lem_flag)
    result = classify_article_words(cleaned_terms, tfidfs_per_doc, idfs)
    return result
def classify_article_file(article_path, tfidfs_per_doc, idfs, lem_flag=False):
    '''
    Classify a single article stored in a file.

    -Args:
        article_path: path of the article file to read.
        tfidfs_per_doc, idfs: passed unchanged to classify_article_words
            (presumably per-category TF-IDF weights and IDF values --
            confirm against that function).
        lem_flag: forwarded to str_corpus_cleaner.get_clean_terms
            (presumably toggles lemmatization -- confirm).
    -Return: matched category and similarity scores for all categories.
    '''
    # Context manager guarantees the handle is closed even if read() raises.
    with codecs.open(article_path, 'rU') as article_file:
        ar_text = article_file.read()
    article_words = str_corpus_cleaner.get_clean_terms(ar_text, lem_flag)

    # Classify article
    match, all_scores = classify_article_words(
        article_words, tfidfs_per_doc, idfs)

    # Remove file extension, if any.
    # NOTE(review): this keeps only the text before the FIRST '.', so a
    # category name that itself contains dots is cut at the first one --
    # confirm that category names never contain '.'.
    match = match.split('.')[0]
    return match, all_scores