# Senses whose "what links here" results are known to be noisy; they are
# skipped outright rather than scraped.
NOISY_SENSES = frozenset(["Bomb the Bass", "Kirklees", "Fallout 2"])

def get_training_data_for_sense_cache(sense, lang):
    """Return (and cache) annotated training paragraphs for a sense."""
    # Skip subpage-style titles, titles with several capitalized words, and
    # the known-noisy senses above.
    if '/' in sense or is_multi_cap(sense) or sense in NOISY_SENSES:
        return []
    file_name = get_joined_name(['sense', lang, sense])
    sense_data = get_from_cache(file_name)
    if sense_data:
        debug("# Getting (cached) sense training data for " + sense)
        return sense_data
    debug("Getting sense training data for " + sense)
    # whatLinksHere.py scrapes the articles that link to this sense and
    # writes them to output_file_name as XML.
    output_file_name = 'output-' + sense + '-' + lang + '.xml'
    subprocess.check_output(['python', 'whatLinksHere.py',
                             sense, lang, str(NUM_WHATLINKS_PER_WORD)])
    sense_link_re = get_sense_link_re(sense)
    pos_entries_by_paragraph = []
    if os.path.exists(output_file_name):
        file_size = os.path.getsize(output_file_name)
        debug(file_size)
        pages = parse_wiki_xml.parse_articles_xml(output_file_name)
        ignored, pos_entries_by_paragraph = \
            get_annotated_paragraphs_in_pages(pages, lang, sense_link_re)
    sense_data = pos_entries_by_paragraph
    # Discard training data that fails the is_likely_lower_sense sanity check.
    if not is_likely_lower_sense(sense_data):
        sense_data = []
    insert_into_cache(file_name, sense_data)
    remove_if_exists(output_file_name)
    return sense_data
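
# get_sense_link_re is defined elsewhere in this module. A minimal sketch of
# what it is assumed to build -- a regex matching [[Sense]] and
# [[Sense|anchor text]] wiki links, tolerating a lowercase first letter the
# way MediaWiki does -- might look like this (the real helper's pattern may
# differ):
def get_sense_link_re_sketch(sense):
    first = sense[0]
    rest = re.escape(sense[1:])
    # e.g. for "Kirklees" this compiles \[\[[Kk]irklees(?:\|[^\]]*)?\]\]
    return re.compile(r'\[\[[' + first.upper() + first.lower() + r']' +
                      rest + r'(?:\|[^\]]*)?\]\]')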
def getStopWords(LANGUAGE_CODE, NUM_ARTICLES, NUM_WORDS):
    """Scrape random articles and return the most frequent words as CSV text."""
    num_pages_scraped = 0
    allText = ""
    while num_pages_scraped < NUM_ARTICLES:
        # Get a batch of random page ids from the random-article API.
        randUrl = ("http://" + LANGUAGE_CODE + ".wikipedia.org/w/api.php?"
                   "action=query&list=random&format=xml&rnlimit=10")
        pageids = []
        for pageid in parse_wiki_xml.parse_random_articles_xml(urllib.urlopen(randUrl)):
            pageids.append(str(pageid))
        num_pages_scraped += len(pageids)
        # "|".join leaves no trailing pipe, so the ids need no trimming.
        pagesStr = "|".join(pageids)
        url = ('http://' + LANGUAGE_CODE + '.wikipedia.org/w/api.php?'
               'action=query&prop=revisions&redirects&rvprop=content'
               '&format=xml&pageids=' + pagesStr)
        print "Scraping from url: " + url
        articles = parse_wiki_xml.parse_articles_xml(urllib.urlopen(url))
        for article in articles:
            if 'content' in article and article['content']:
                allText += article['content']
        print "Length of all articles scraped: " + str(len(allText))
    # Count word frequencies over everything scraped; the most common words
    # become the stop-word list, one "word,count" pair per line.
    words = re.findall(r'\w+', allText.lower())
    mostFrequent = collections.Counter(words).most_common(NUM_WORDS)
    print mostFrequent
    output = ""
    for word, count in mostFrequent:
        output += word + "," + str(count) + "\n"
    return output
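
# A minimal usage sketch for getStopWords (needs network access to the live
# Wikipedia API; the argument values here are illustrative, not defaults
# taken from this repo):
def _stop_words_demo():
    csv_text = getStopWords('en', 30, 100)  # 30 random articles, top 100 words
    stop_words = [line.split(',')[0] for line in csv_text.splitlines()]
    print stop_words[:10]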
def wsd(xml_file_name, lang):
    """Run word-sense disambiguation over every page in an XML article dump."""
    global wsd_output_file
    stop_words = get_stop_words(lang)
    for page in parse_wiki_xml.parse_articles_xml(xml_file_name):
        output_file_base = os.path.join(lang + '2', page['pageid'] + '-' + lang)
        output_file_tmp = output_file_base + '.tmp'
        output_file_name = output_file_base + '.txt'
        debug("Disambiguating " + page['title'])
        # Skip pages whose final output file already exists.
        if os.path.exists(output_file_name):
            continue
        wsd_output_file = open(output_file_tmp, 'w')
        wsd_page(page['pageid'], page['title'], page['content'], lang, stop_words)
        wsd_output_file.close()
        wsd_output_file = None
        # Rename the .tmp file only after a successful run, so an interrupted
        # run never leaves a truncated .txt behind.
        os.rename(output_file_tmp, output_file_name)
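
# A minimal driver sketch for wsd(). The dump file name is hypothetical, and
# wsd() writes into the '<lang>2' directory, so create it first if missing:
def _wsd_demo(lang):
    out_dir = lang + '2'
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    wsd('articles-' + lang + '.xml', lang)  # hypothetical dump file name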