def wikipediaSearch(self, word="iOS", lang='simple', maximumNumberOfResults=1, save=False):
    '''
    Search Wikipedia for ``word`` and collect the plain text of the hits.

    Every search result whose article can be fetched is converted to plain
    text with ``Wiki2Plain`` and appended to ``self.documents``.

    word                   -- search term.
    lang                   -- Wikipedia language code passed to ``Wikipedia``.
    maximumNumberOfResults -- forwarded to ``wiki.search`` as its third argument.
    save                   -- when true, also write each article to
                              ``<title>.txt`` in the current directory.

    Results whose article cannot be fetched (or comes back empty) are
    skipped silently; nothing is returned.
    '''
    wiki = Wikipedia(lang)
    # NOTE(review): the meaning of the literal 1 depends on Wikipedia.search,
    # which is not visible here -- confirm against that method.
    resultadosdebusqueda = wiki.search(word, 1, maximumNumberOfResults)

    for resultado in resultadosdebusqueda:
        # Best effort: a result that cannot be read or downloaded is skipped,
        # but Ctrl-C / SystemExit are no longer swallowed.
        try:
            title = resultado['title']
            raw = wiki.article(title)
        except Exception:
            continue
        if not raw:
            continue

        content = Wiki2Plain(raw).text

        if save:
            # A '/' in the title would be read as a directory separator.
            filename = title.replace('/', '_') + '.txt'
            with open(filename, 'w+') as f:
                f.write(content)

        self.documents.append(content)
    # Progress reporting (os.system('clear') followed by a progress print)
    # was disabled in the original at this point.
class WikipediaSearch(object):
    """Attach Wikipedia search results to program documents and download
    the selected articles as text files."""

    # Fields written to / read from each program document.
    _RESULTS_FIELD = 'wikipediaResults'
    _SELECTED_FIELD = 'wikipediaSelected'

    def __init__(self, mlDb='ml', maxResult=10, overwrite=False, targetDir='./'):
        """
        Connect to the database and set up the English Wikipedia client.

        mlDb      : name of the mongodb database to use
        maxResult : number of search results requested per program,
                    default 10
        overwrite : when True, programs that already have Wikipedia results
                    are searched again, default False
        targetDir : directory where downloaded articles are saved
        """
        conn = Connection()
        db = conn[mlDb]
        self.programs = db.programs
        lang = 'en'
        self.wiki = Wikipedia(lang)
        self.maxResult = maxResult
        self.overwrite = overwrite
        self.targetDir = targetDir
        logging.info('Mongodb initialized in %s db for MovieLens', mlDb)

    @staticmethod
    def _utf8(text):
        """Encode ``text`` as UTF-8 unless it already is a native ``str``.

        On Python 2 this turns ``unicode`` into a byte string and leaves
        byte strings alone (re-encoding those could raise on non-ASCII data).
        """
        if isinstance(text, str):
            return text
        return text.encode('utf-8')

    def fill(self, maxProgramNumber=10):
        """
        Store Wikipedia search results on program documents.

        At most ``maxProgramNumber`` programs are visited.  A program is
        searched by its 'name' when ``self.overwrite`` is set or when it has
        no 'wikipediaSelected' field yet; the full result list goes into
        'wikipediaResults' and the title of the first hit into
        'wikipediaSelected'.  Programs with no hits are left untouched.
        """
        for program in self.programs.find().limit(maxProgramNumber):
            if not self.overwrite and self._SELECTED_FIELD in program:
                continue
            results = self.wiki.search2(self._utf8(program['name']), self.maxResult)
            print(results)
            if results:
                selected = results[0]['title']
                self.programs.update(
                    {'_id': program['_id']},
                    {"$set": {self._RESULTS_FIELD: results,
                              self._SELECTED_FIELD: selected}},
                    upsert=False)

    def downloadArticles(self, maxProgramNumber=10):
        """
        Download the selected Wikipedia article of each program.

        At most ``maxProgramNumber`` programs are visited; those without a
        'wikipediaSelected' field are skipped.  Each article is written to
        ``targetDir`` in a file named after the title with spaces, '/' and
        ':' removed, plus '.txt'.  Errors raised while fetching an article
        propagate to the caller.
        """
        # Function-scope import: the file's import block is not visible here.
        import os

        print('running downloadArticles')
        for program in self.programs.find().limit(maxProgramNumber):
            if self._SELECTED_FIELD not in program:
                continue
            title = self._utf8(program[self._SELECTED_FIELD])
            filename = title.replace(" ", "").replace("/", "").replace(":", "") + ".txt"
            # join() also works when targetDir has no trailing separator.
            path = os.path.join(self.targetDir, filename)
            print('writing: ' + path)
            # Fetch before opening the file so a failed download does not
            # leave an empty file (or an unclosed handle) behind.
            rawArticle = self.wiki.article(title)
            with open(path, 'w') as f:
                f.write(rawArticle)
def wiki_extract(article, lang='pt'):
    """Fetch a Wikipedia article and split its plain text into sections.

    Returns a dict with a "resumo" (summary) entry holding the paragraphs
    that precede the first heading.  Each "== title ==" heading adds an
    entry ``{title: {"text": ...}}`` at the top level, and each
    "=== title ===" heading adds one inside the most recent "==" section
    (or at the top level when no "==" section has been opened yet).
    Paragraphs are stored as concatenated ``<p>...</p>`` fragments.

    If the article cannot be fetched or is empty, the result is
    ``{"resumo": ""}``.
    """
    wiki = Wikipedia(lang)
    try:
        raw = wiki.article(article)
    except Exception:
        # Best effort: an article that cannot be fetched yields an empty result.
        raw = None

    content = ""
    if raw:
        content = Wiki2Plain(raw).text

    content_dict = {"resumo": ""}
    current_pointer = content_dict  # dict receiving the next paragraph
    parent_pointer = content_dict   # dict receiving the next "===" subsection
    first = True                    # True until the first heading is seen

    for line in content.splitlines():
        line = line.strip()
        if not line:
            continue

        if line.startswith("==="):
            title = __get_title(line)
            parent_pointer[title] = {"text": ""}
            current_pointer = parent_pointer[title]
            # A subsection can appear before any "==" section.  Its
            # paragraphs belong to it, not to the summary; the old code
            # raised KeyError('resumo') on the first such paragraph.
            first = False
        elif line.startswith("=="):
            title = __get_title(line)
            content_dict[title] = {"text": ""}
            parent_pointer = content_dict[title]
            current_pointer = content_dict[title]
            first = False
        elif first:
            content_dict["resumo"] = "%s<p>%s</p>" % (
                content_dict["resumo"], line)
        else:
            current_pointer["text"] = "%s<p>%s</p>" % (
                current_pointer["text"], line)

    return content_dict
def wiki_extract(article, lang='pt'):
    """Fetch a Wikipedia article and split its plain text into sections.

    Returns a dict with a "resumo" (summary) entry holding the paragraphs
    that precede the first heading.  Each "== title ==" heading adds an
    entry ``{title: {"text": ...}}`` at the top level, and each
    "=== title ===" heading adds one inside the most recent "==" section
    (or at the top level when no "==" section has been opened yet).
    Paragraphs are stored as concatenated ``<p>...</p>`` fragments.

    If the article cannot be fetched or is empty, the result is
    ``{"resumo": ""}``.
    """
    wiki = Wikipedia(lang)
    try:
        raw = wiki.article(article)
    except Exception:
        # Best effort: an article that cannot be fetched yields an empty result.
        raw = None

    content = ""
    if raw:
        content = Wiki2Plain(raw).text

    content_dict = {"resumo": ""}
    current_pointer = content_dict  # dict receiving the next paragraph
    parent_pointer = content_dict   # dict receiving the next "===" subsection
    first = True                    # True until the first heading is seen

    for line in content.splitlines():
        line = line.strip()
        if not line:
            continue

        if line.startswith("==="):
            title = __get_title(line)
            parent_pointer[title] = {"text": ""}
            current_pointer = parent_pointer[title]
            # A subsection can appear before any "==" section.  Its
            # paragraphs belong to it, not to the summary; the old code
            # raised KeyError('resumo') on the first such paragraph.
            first = False
        elif line.startswith("=="):
            title = __get_title(line)
            content_dict[title] = {"text": ""}
            parent_pointer = content_dict[title]
            current_pointer = content_dict[title]
            first = False
        elif first:
            content_dict["resumo"] = "%s<p>%s</p>" % (
                content_dict["resumo"], line)
        else:
            current_pointer["text"] = "%s<p>%s</p>" % (
                current_pointer["text"], line)

    return content_dict
import sys
from wikipedia import Wikipedia
from wiki2plain import Wiki2Plain

# Download the English Wikipedia article named by the first command-line
# argument and save its plain text to ../corpus/<article>.txt.
lang = 'en'
wiki = Wikipedia(lang)

try:
    articleName = str(sys.argv[1])  # e.g. 'Uruguay'
    raw = wiki.article(articleName)
except Exception:
    # Missing argument or failed download: nothing is written.
    raw = None

if raw:
    wiki2plain = Wiki2Plain(raw)
    # The context manager closes (and flushes) the file even if write() fails.
    with open('../corpus/' + articleName + '.txt', 'w') as f:
        f.write(wiki2plain.text)
    #content = wiki2plain.text
    #print(wiki2plain.text)
# For every answer in AnswerSet, fetch its Simple English Wikipedia article
# and write (answer, first line of the article's plain text) to wiki.csv.
lang = 'simple'
wiki = Wikipedia(lang)

counts = 0    # rows written so far (answers whose article was found)
n_answer = 0  # answers processed so far

# 'wb' is the mode the csv module expects on Python 2; the context manager
# guarantees the rows are flushed and the handle is released.
with open("wiki.csv", 'wb') as csv_file:
    o = DictWriter(csv_file, ["answer", "question"])
    o.writeheader()

    for answer in AnswerSet:
        print(n_answer)
        n_answer += 1

        try:
            raw = wiki.article(answer)
        except Exception:
            # Best effort: answers without a retrievable article are skipped.
            raw = None

        if raw:
            question = Wiki2Plain(raw).text.split('\n')[0]
            d = {'answer': answer, 'question': question}
            o.writerow(d)
            print(counts)
            counts += 1

# counts = 0
# total = 0
# for row in DictReader(open('sci_train.csv')):
from wikipedia import Wikipedia
from Wiki2Plain import Wiki2Plain

if __name__ == '__main__':
    # Fetch the Simple English article on Uruguay, print the raw wiki markup
    # and then its plain-text rendering.
    lang = 'simple'
    wiki = Wikipedia(lang)

    try:
        raw = wiki.article('Uruguay')
    except Exception:
        # Only a failed fetch counts as "no article"; Ctrl-C still aborts.
        raw = None
    else:
        # Printed outside the try so an output error is not mistaken for a
        # missing article.
        print(raw)

    if raw:
        # Decode byte payloads; text is passed through unchanged
        # (bytes(raw) raises TypeError for text on Python 3).
        if isinstance(raw, (bytes, bytearray)):
            text = raw.decode("utf-8")
        else:
            text = raw
        wiki2plain = Wiki2Plain(text)
        content = wiki2plain.text
        print(content)
from wikipedia import Wikipedia
from wiki2plain import Wiki2Plain
import io

# Fetch the Simple English article on Arizona, dump its plain text to
# per.txt, then copy to per1.txt the first two lines that contain none of
# the characters '{', '|' or '}'.

# Start from an empty string so the write below still works when the
# article cannot be fetched (a dict here made `"" + content` raise TypeError).
content = ""
lang = 'simple'
wiki = Wikipedia(lang)

try:
    raw = wiki.article('Arizona')
except Exception:
    raw = None

if raw:
    wiki2plain = Wiki2Plain(raw)
    content = wiki2plain.text

print(content)

with io.open("per.txt", "wb") as model_file:
    model_file.write("" + content)

i = 0  # number of lines copied so far
with io.open("per1.txt", "wb") as model_file1, open("per.txt", "r") as f:
    while i < 2:
        # At end of file readline() returns '', which takes the else branch,
        # so the loop always terminates.
        line = f.readline()
        if "{" in line or "|" in line or "}" in line:
            print("")
        else:
            i += 1
            model_file1.write("" + line)