# Imports needed by the functions in this section (BeautifulSoup 3 /
# Python 2 API). Module-level constants such as HINDI_SAVE_DIR,
# invalid_filename_chars, YG_BASE_URL and the mkdir_p helper are
# defined elsewhere in the repo.
import codecs
import os
import re
import shutil
import sys

from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup
from nltk.corpus import stopwords
import twill.commands as tc


def scanWikiExtractorFile(fileName):
    """Split a WikiExtractor output file into articles and save each
    Hindi article under its (sanitised) title."""
    print "reading file ", fileName
    with open(fileName, 'r') as f:
        content = f.read()
    # WikiExtractor concatenates articles, each closed by '</doc>',
    # so the last split chunk is empty and is dropped.
    articles = content.split('</doc>')
    for article in articles[:-1]:
        try:
            soup = BeautifulStoneSoup(article)
        except UnicodeEncodeError:
            print "UnicodeEncodeError"
            continue
        title = soup.find('doc')['title']
        text = soup.getText()
        # Titles can contain '/', ':' and similar characters that make
        # file names invalid, so replace them with spaces.
        for c in invalid_filename_chars:
            if c in title:
                title = title.replace(c, ' ')
        print title
        hindiArtName = title
        g = codecs.open(HINDI_SAVE_DIR + '/' + hindiArtName, 'w', encoding='utf-8-sig')
        g.write(text)
        g.close()
    return
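
# For reference, a sketch of the WikiExtractor output that this function
# parses; exact attributes vary by WikiExtractor version, but each file is
# a concatenation of articles wrapped in <doc> tags, which is why the loop
# splits on '</doc>':
#
#   <doc id="1234" url="https://hi.wikipedia.org/wiki?curid=1234" title="...">
#   article text ...
#   </doc>
#   <doc id="1235" title="...">
#   ...
#   </doc>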

def scanWikiExtractorFile(fileName):
    """Variant for building the comparable corpus: save each English
    article that has a corresponding Hindi article, numbering the pair."""
    global comparableFileNumber
    global comparableMap
    print "reading file ", fileName
    with open(fileName, 'r') as f:
        content = f.read()
    articles = content.split('</doc>')
    for article in articles[:-1]:
        try:
            soup = BeautifulStoneSoup(article)
        except UnicodeEncodeError:
            print "UnicodeEncodeError"
            continue
        title = soup.find('doc')['title']
        if title in comparableMap:
            # This English article has a corresponding Hindi article.
            print title, 'found'
            comparableFileNumber += 1
            text = soup.getText()
            # No need to sanitise the title here: files are stored by
            # number, not by title.
            g = codecs.open(CORRESPONDING_ENGLISH_SAVE_DIR + '/' + 'en_' + str(comparableFileNumber),
                            'w', encoding='utf-8-sig')
            g.write(text)
            g.close()
            # Copy the matching Hindi article now, one pair at a time,
            # rather than copying everything in comparableMap up front:
            # some Hindi ids have no English article in this dump, and
            # because each pair shares comparableFileNumber, pre-assigned
            # numbers would leave gaps whenever an English article is
            # missing.
            hindiId = comparableMap[title]
            shutil.copyfile(HINDI_ARTICLES_DIR + '/' + hindiId,
                            CORRESPONDING_HINDI_SAVE_DIR + '/' + 'hi_' + str(comparableFileNumber))
            # Titles are unique keys, so this mapping is never needed
            # again; deleting it keeps the map small and lookups fast.
            del comparableMap[title]
    return
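
# A minimal driver sketch for the comparable-corpus variant above. The
# function name, EN_WIKI_EXTRACTED_DIR, and the walk over WikiExtractor's
# AA/wiki_00-style layout are assumptions, not part of the original code;
# it also assumes comparableMap (English title -> Hindi article id) has
# been populated beforehand.
def scanAllWikiExtractorFiles(EN_WIKI_EXTRACTED_DIR):
    for dirpath, dirnames, filenames in os.walk(EN_WIKI_EXTRACTED_DIR):
        for name in sorted(filenames):
            scanWikiExtractorFile(os.path.join(dirpath, name))
    print "comparable pairs found:", comparableFileNumber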

def sentenceToWordlistHindi(sentence, remove_stopwords=False):
    """Strip markup from a Hindi sentence and split it into words."""
    soup = BeautifulStoneSoup(sentence)
    if soup is None:
        print "soup has not been cooked yet !"
        return []
    sentence_text = soup.getText()
    words = sentence_text.split()
    # No stemming here: the available shallow parser is not reliable
    # enough for Hindi.
    if remove_stopwords:
        stops = set(stopwords.words("hindi"))
        words = [w for w in words if w not in stops]
    return words
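
# A small usage sketch for the Hindi tokeniser above (shown on an English
# placeholder string; the behaviour is identical for Hindi text):
#
#   >>> sentenceToWordlistHindi(u'<s>some raw text</s>')
#   [u'some', u'raw', u'text']
#
# Passing remove_stopwords=True additionally requires a 'hindi' stopword
# file under NLTK's corpus path, which NLTK does not ship by default.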

def sentenceToWordlistEnglish(sentence, remove_stopwords=False):
    """Strip markup from an English sentence, lower-case it, split it into
    words, and optionally stem them and remove stop words."""
    soup = BeautifulStoneSoup(sentence)
    if soup is None:
        print "soup has not been cooked yet !"
        return []
    sentence_text = soup.getText()
    # Removing non-letters is disabled:
    # sentence_text = re.sub("[^a-zA-Z]", " ", sentence_text)
    words = sentence_text.lower().split()
    if IF_STEMMING:
        for i in range(len(words)):
            words[i] = ENstemmer.stem(words[i])
    # Optionally remove stop words (off by default).
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return words
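
# The English tokeniser above assumes two module-level names not defined
# in this excerpt: IF_STEMMING and ENstemmer. A minimal setup sketch using
# NLTK's Porter stemmer (the repo may use a different stemmer):
from nltk.stem.porter import PorterStemmer

IF_STEMMING = True
ENstemmer = PorterStemmer()
# e.g. ENstemmer.stem('running') -> 'run'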

def download_message(message_id, message_path, yahoo_group):
    """Fetch one Yahoo! Groups message, save its raw HTML, header, and
    body under message_path, and mark non-existent messages with 'na'."""
    mkdir_p(message_path)
    header_filepath = message_path + '/header'
    body_filepath = message_path + '/body'
    na_filepath = message_path + '/na'
    allhtml_filepath = message_path + '/all_html'
    # Skip messages already downloaded or known not to exist.
    if os.path.exists(message_path):
        if os.path.exists(na_filepath):
            return
        if os.path.exists(header_filepath) and os.path.exists(body_filepath):
            return
    msg_url = '%s/%s/message/%s?source=1&unwrap=1' % (YG_BASE_URL, yahoo_group, message_id)
    # Optional polite delay between requests:
    # time.sleep(HUMAN_WAIT + random.randint(0, HUMAN_REFLEX))
    tc.go(msg_url)
    b = tc.get_browser()
    html = b.get_html()
    # Always archive the raw page before parsing it.
    f = open(allhtml_filepath, 'w')
    f.write(html)
    f.close()
    pattern_invalid = re.compile("Message (%s)? does not exist in %s" % (message_id, yahoo_group))
    if re.search(pattern_invalid, html):
        print "Message %s doesn't exist" % message_id
        f = open(na_filepath, 'w')
        f.close()
        return
    pattern_content = re.compile(r'<!-- start content include -->\s(.+?)\s<!-- end content include -->', re.DOTALL)
    m1 = re.search(pattern_content, html)
    if not m1:
        print "invalid format: html"
        return
    email_content = m1.group(1)
    mysoup = BeautifulSoup(email_content)
    source_content = mysoup.find('td', {'class': 'source user'}).__repr__()
    source_content = unicode(source_content, 'utf-8', errors='replace')
    source_content = source_content.encode('utf-8')
    m2 = re.search(re.compile(r'\s+(From .+?\s*)?<br />\s+<br />\s+(.+)</td>', re.DOTALL), source_content)
    if not m2:
        # Dump the unparsed content for debugging, then abort.
        print "invalid format: email_content"
        f = open("source_content", 'w')
        f.write(source_content)
        f.close()
        sys.exit(1)
    # The 'From ' header group is optional, so group(1) may be None.
    email_header = m2.group(1) or ''
    email_body = m2.group(2)

    def stripMarkup(block):
        # Replace links with their anchor text, drop trailing <br />
        # tags, and decode HTML entities, line by line.
        new_lines = []
        for l in block.split('\n'):
            nl = re.sub(r'<a href=".+?>(.+?)</a>', lambda m: m.group(1), l)
            nl = re.sub(r'<br />$', '', nl)
            nl = BeautifulStoneSoup(nl, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
            new_lines.append(nl.getText())
        return '\n'.join(new_lines)

    email_header = stripMarkup(email_header)
    email_body = stripMarkup(email_body)
    # getText() returns unicode, so encode both parts before writing.
    f_header = open(header_filepath, 'w')
    f_header.write(email_header.encode('utf-8'))
    f_header.close()
    f_body = open(body_filepath, 'w')
    f_body.write(email_body.encode('utf-8'))
    f_body.close()
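
# A minimal crawl-loop sketch for download_message. The function name, id
# range, and ARCHIVE_DIR layout are placeholders; the randomized delay
# uses the HUMAN_WAIT / HUMAN_REFLEX constants referenced (commented out)
# in download_message and assumed to be defined elsewhere in the repo.
import random
import time

def download_group(yahoo_group, first_id, last_id, ARCHIVE_DIR):
    for message_id in range(first_id, last_id + 1):
        message_path = '%s/%s/%d' % (ARCHIVE_DIR, yahoo_group, message_id)
        download_message(str(message_id), message_path, yahoo_group)
        # Be polite: wait a human-ish interval between requests.
        time.sleep(HUMAN_WAIT + random.randint(0, HUMAN_REFLEX))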