def wikipediaSearch(self, word="iOS", lang='simple', maximumNumberOfResults=1, save=False):
    '''
    (get) Wikipedia (corpus (documents)) (by) Search
    Receives a word: 'word', searches Wikipedia for 'word' and stores the articles in 'self.corpus'.
    '''
    wiki = Wikipedia(lang)
    resultadosdebusqueda = wiki.search(word, 1, maximumNumberOfResults)
    numerodearticulos = len(resultadosdebusqueda)
    for resultado in resultadosdebusqueda:
        try:
            raw = wiki.article(resultado['title'])
        except:
            raw = None
        if raw:
            wiki2plain = Wiki2Plain(raw)
            content = wiki2plain.text
            if save:
                f = open(resultado['title'] + '.txt', 'w+')
                f.write(content)
                f.close()
            self.documents.append(content)
    #os.system('clear')
    # Prints the method's progress.
def input_error(self):
    cwd = os.getcwd()
    configuration_file = os.path.join(cwd, "config", "app_config.json")
    try:
        wiki = Wikipedia('tests/test.input8', configuration_file)
        wiki.parse_input()
        self.assertEqual(False, "test failed")
    except Exception as e:
        print e
        self.assertEqual(True, True)
class WikipediaSearch(object):

    def __init__(self, mlDb='ml', maxResult=10, overwrite=False, targetDir='./'):
        """
        Init method.
        mlDb      : MongoDB database name
        maxResult : number of search results to fetch for each program, default 10
        overwrite : if True, overwrite programs that already have Wikipedia results, default False
        targetDir : directory where the downloaded Wikipedia articles are saved
        """
        conn = Connection()
        db = conn[mlDb]
        self.programs = db.programs
        lang = 'en'
        self.wiki = Wikipedia(lang)
        self.maxResult = maxResult
        self.overwrite = overwrite
        self.targetDir = targetDir
        logging.info('Mongodb initialized in %s db for MovieLens' % mlDb)

    def fill(self, maxProgramNumber=10):
        """Fill programs in the MongoDB backend with Wikipedia search results;
        maxProgramNumber determines how many programs are processed."""
        wikipediaResultsField = 'wikipediaResults'
        wikipediaSelectedField = 'wikipediaSelected'
        for program in self.programs.find().limit(maxProgramNumber):
            if self.overwrite or not (wikipediaSelectedField in program):
                results = self.wiki.search2(program['name'].encode('utf-8'), self.maxResult)
                print results
                if len(results) > 0:
                    selected = results[0]['title']
                    self.programs.update({'_id': program['_id']},
                                         {"$set": {wikipediaResultsField: results,
                                                   wikipediaSelectedField: selected}},
                                         upsert=False)

    def downloadArticles(self, maxProgramNumber=10):
        """Download the selected Wikipedia article for each program into targetDir;
        maxProgramNumber determines how many programs are processed."""
        print 'running downloadArticles'
        wikipediaResultsField = 'wikipediaResults'
        wikipediaSelectedField = 'wikipediaSelected'
        for program in self.programs.find().limit(maxProgramNumber):
            # print program['name']
            # print program['wikipediaSelected']
            if wikipediaSelectedField in program:
                filename = program[wikipediaSelectedField].encode('utf-8').replace(" ", "").replace("/", "").replace(":", "") + ".txt"
                print 'writing: ' + self.targetDir + filename
                f = open(self.targetDir + filename, 'w')
                rawArticle = self.wiki.article(program[wikipediaSelectedField].encode('utf-8'))
                f.write(rawArticle)
                f.close()
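# A minimal usage sketch for WikipediaSearch above (assumptions: a MongoDB instance is
# running locally with an 'ml' database whose 'programs' collection holds documents with
# a 'name' field, and './articles/' is a writable directory; both are illustrative).
searcher = WikipediaSearch(mlDb='ml', maxResult=5, overwrite=False, targetDir='./articles/')
searcher.fill(maxProgramNumber=20)              # attach wikipediaResults / wikipediaSelected
searcher.downloadArticles(maxProgramNumber=20)  # dump the selected raw articles to ./articles/*.txt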
def on_get(self, req, resp, search_term):
    """Handle search requests."""
    w = Wikipedia()
    try:
        resp.body = json.dumps(w.search([search_term]))
        resp.status = falcon.HTTP_200
    except Exception as e:
        resp.body = json.dumps({
            "Error": "Something went wrong, sorry!",
            "Exception": str(e)  # exception objects are not JSON-serializable; serialize the message
        })
        resp.status = falcon.HTTP_500
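# A minimal mounting sketch for the handler above (assumptions: the handler belongs to a
# falcon resource class, hypothetically named SearchResource here, registered on an
# old-style falcon.API() application; none of these names appear in the source):
#
#   app = falcon.API()
#   app.add_route('/search/{search_term}', SearchResource())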
def add_entry():
    wiki = Wikipedia(url_entry.get())
    name, birthday, deathday = wiki.scrap_person()
    query = name + "\n" + birthday + "\n" + deathday
    send_person("1", query)
    message = recieve_message()
    message = message['data'].decode('utf-8')
    if message == "OK":
        messagebox.showinfo("Operation Successful!",
                            "Person was successfully added to the database")
    else:
        messagebox.showwarning("Operation Failed.",
                               "Person exists on the database already.")
def wiki_extract(article, lang='pt'):
    wiki = Wikipedia(lang)
    try:
        raw = wiki.article(article)
    except:
        raw = None
    content = ""
    if raw:
        wiki2plain = Wiki2Plain(raw)
        content = wiki2plain.text
    content_dict = {"resumo": ""}
    current_pointer = content_dict
    parent_pointer = content_dict
    first = True
    for line in content.splitlines():
        line = line.strip()
        if line != "":
            if line.startswith("==") and not line.startswith("==="):
                title = __get_title(line)
                content_dict[title] = {"text": ""}
                parent_pointer = content_dict[title]
                current_pointer = content_dict[title]
                first = False
            elif line.startswith("==="):
                title = __get_title(line)
                parent_pointer[title] = {"text": ""}
                current_pointer = parent_pointer[title]
            else:
                if first:
                    content_dict["resumo"] = "%s<p>%s</p>" % (current_pointer["resumo"], line)
                else:
                    current_pointer["text"] = "%s<p>%s</p>" % (current_pointer["text"], line)
    return content_dict
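# A minimal usage sketch for wiki_extract (the article title 'Brasil' is only an example;
# the returned dict follows the structure built above: a 'resumo' summary string plus one
# nested dict per '==' section, each carrying a 'text' key and any '===' sub-sections).
sections = wiki_extract('Brasil', lang='pt')
print(sections['resumo'])
for title in sections:
    if title != 'resumo':
        print(title + ': ' + str(len(sections[title]['text'])) + ' chars of text')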
def no_answer_question(self):
    try:
        cwd = os.getcwd()
        configuration_file = os.path.join(cwd, "config", "app_config.json")
        wiki = Wikipedia('tests/err', configuration_file)
        wiki.parse_input()
        wiki.calculate_and_print_answers()
        self.assertEqual(False, "test failed")
    except Exception as e:
        print e
        self.assertEqual(True, True)
    try:
        wiki = Wikipedia('tests/test.input8', configuration_file)
        q = QuestionAnswer('why is the earth flat?')
        self.assertEqual(False, q.find_answer(wiki.paragraph, wiki.appconfig))
    except Exception as e:
        self.assertEqual(False, "test failed")
def __init__(self, mlDb='ml', maxResult=10, overwrite=False, targetDir='./'):
    """
    Init method.
    mlDb      : MongoDB database name
    maxResult : number of search results to fetch for each program, default 10
    overwrite : if True, overwrite programs that already have Wikipedia results, default False
    targetDir : directory where the downloaded Wikipedia articles are saved
    """
    conn = Connection()
    db = conn[mlDb]
    self.programs = db.programs
    lang = 'en'
    self.wiki = Wikipedia(lang)
    self.maxResult = maxResult
    self.overwrite = overwrite
    self.targetDir = targetDir
    logging.info('Mongodb initialized in %s db for MovieLens' % mlDb)
def testAll(self):
    try:
        cwd = os.getcwd()
        configuration_file = os.path.join(cwd, "config", "app_config.json")
        wiki = Wikipedia('test.input', configuration_file)
        wiki.parse_input()
        wiki.calculate_and_print_answers()
        self.assertEqual(True, True)
    except Exception as e:
        print e
        traceback.print_exc(file=sys.stdout)
        self.assertEqual(True, False)
def test_dataloader(self):
    return Wikipedia("TEST", self.tokenizer, transform=self.eval_transform).get_dataloader(
        batch_size=self.hparams.bs, shuffle=False)
def wiki(self, message):
    query, wiki = self.q.search(message), Wikipedia('en')
    try:
        return self.q.cut(Wiki2Plain(wiki.article(query), query).text)
    except:
        return "The Enrichment Center regrets to inform you that this next test is impossible."
'''
A script used to randomly collect Wikipedia articles
'''
import argparse

from wikipedia import Wikipedia

''' Parse command line arguments '''
parser = argparse.ArgumentParser()
parser.add_argument("time_limit", type=int, help="crawling time limit in seconds")
parser.add_argument("subdomain", type=str, help="crawling subdomain")
parser.add_argument("-s", "--summary", action="store_true",
                    help="collect summaries instead of full articles")
args = parser.parse_args()

''' Start crawling '''
wiki = Wikipedia(args.subdomain, args.summary)
wiki.crawl(args.time_limit)
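# Example invocation of the crawler above (the script filename 'crawl.py' is assumed
# for illustration; only the arguments are defined in the source):
#   python crawl.py 3600 simple --summary
# crawls the 'simple' subdomain for one hour, collecting article summaries.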
def main():
    data = "../resources/SOusers-Mar13.csv"  # File containing SO user dump
    results = "../resources/features3.csv"  # File where features will be stored
    picPath = "../resources/SOpictures/"  # Directory where pictures will be downloaded
    fr = open(os.path.join(data), 'rb')
    fw = open(os.path.join(results), 'ab')
    if _RANDOM:
        reader = RandomReader(fr)
    else:
        reader = UnicodeReader(fr)
    writer = UnicodeWriter(fw)
    queue = Queue()
    if _FACE:
        faceDetector = FaceDetector()
    threads = []
    SOhashes = {}  # Dictionary of user's hashes
    # Use multiple threads to download and get information
    for i in xrange(10):
        threads.append(Downloader(queue))
        threads[-1].start()
    idx = 0
    size = 4500  # Number of subjects
    for row in reader:
        if idx < size:
            so_uid = row[0]
            so_hash = row[2]
            if (not (SOhashes.has_key(so_hash))):
                SOhashes[so_hash] = so_uid
                if (not isDefaultGravatarPic(so_hash)):
                    data = [so_uid]
                    if _VISUAL_FEATURES:
                        # Download picture
                        filepath = os.path.join('%s%d.jpg' % (picPath, int(so_uid)))
                        if not os.path.isfile(filepath):
                            queue.put(('http://www.gravatar.com/avatar/%s' % so_hash, filepath))
                            time.sleep(2)
                        # Load picture
                        pic = picUtils.loadPicture(filepath)
                        if _FACE:
                            if faceDetector.isFrontFace(pic) or faceDetector.isProfileFace(pic):
                                data.append(str(True))
                            else:
                                data.append(str(False))
                        if _MOST_COMMON_COLORS:
                            _, f1, _, f2 = picUtils.mostCommonColor(pic)
                            data.append(str(f1 + f2))
                        if _NBCOLORS:
                            data.append(str(picUtils.getNbOfColors(pic)))
                        if _FARTHEST_NEIGHBOR:
                            F1 = picUtils.farthestNeighborMetric(pic, 10)
                            F2 = picUtils.farthestNeighborMetric(pic, 200)
                            data.append(str(F1))
                            data.append(str(F2))
                            if F1 != 0:
                                data.append(str(F2 / F1))
                            else:
                                data.append('?')
                        if _AVERAGE_SATURATION:
                            data.append(str(picUtils.avgSaturation(pic)))
                        if _THRESHOLD_BRIGHTNESS:
                            data.append(str(picUtils.threBrightness(pic, 0.2)))
                    if _GOOGLE:
                        gi = GoogleImage('http://www.gravatar.com/avatar/%s' % so_hash)
                        bestGuess = gi.getBestGuess()
                        if bestGuess:
                            bestGuess = bestGuess.encode('utf8')
                            data.append(bestGuess)
                            if _WIKIPEDIA:
                                gs = GoogleSearch("%s site:en.wikipedia.org" % bestGuess)
                                wikiTitlePage = gs.getWikipediaTitlePage()
                                if wikiTitlePage:
                                    wiki = Wikipedia(wikiTitlePage)
                                    wiki.categoryGraph(4)
                                    nbCats = 10
                                    i = 0
                                    cats = wiki.sortGraphByDegree()
                                    while i < nbCats and i < len(cats):
                                        data.append(str(cats[i]))
                                        i += 1
                    # Write all information collected in the csv file
                    try:
                        print data
                        writer.writerow(data)
                        idx += 1
                    except:
                        print "Error with data"
        else:
            break
    fr.close()
    fw.close()
    # If here, download finished. Stop threads
    for i in xrange(10):
        queue.put((None, None))
from wikipedia import Wikipedia
import argparse

'''
A script used to randomly collect Wikipedia articles
'''

''' Parse command line arguments '''
parser = argparse.ArgumentParser()
parser.add_argument("how_many_pages", type=int, help="crawling articles limit")
parser.add_argument("subdomain", type=str, help="crawling subdomain")
args = parser.parse_args()

''' Start crawling '''
wiki = Wikipedia(args.subdomain)
wiki.crawl(args.how_many_pages)
def val_dataloader(self):
    print("Using Wikipedia")
    return Wikipedia("VAL", self.tokenizer, transform=self.eval_transform).get_dataloader(
        batch_size=self.hparams.bs, shuffle=False)
def train_dataloader(self):
    print("Using Wikipedia")
    return Wikipedia("TRAIN", self.tokenizer, transform=self.train_transform).get_dataloader(
        batch_size=self.hparams.bs, shuffle=True)
from wikipedia import Wikipedia
from wiki2plain import Wiki2Plain
import io

content = {}
lang = 'simple'
wiki = Wikipedia(lang)
try:
    raw = wiki.article('Arizona')
except:
    raw = None
if raw:
    wiki2plain = Wiki2Plain(raw)
    content = wiki2plain.text
    print content

model_file = io.open("per.txt", "wb")
model_file.write("" + content)
model_file.close()

model_file1 = io.open("per1.txt", "wb")
i = 0
with open("per.txt", "r") as f:
    while (i < 2):
        line = f.readline()
        if "{" in line or "|" in line or "}" in line:
            print("")
        else:
            i += 1
            model_file1.write("" + line)
from wikipedia import Wikipedia
from Wiki2Plain import Wiki2Plain

if __name__ == '__main__':
    lang = 'simple'
    wiki = Wikipedia(lang)
    try:
        raw = wiki.article('Uruguay')
        print(raw)
    except:
        raw = None
    if raw:
        wiki2plain = Wiki2Plain(bytes(raw).decode("utf-8"))
        content = wiki2plain.text
        print(content)
def __init__(self):
    self.wikipedia_caller = Wikipedia()
    self.wikidata_caller = Wikidata()
    self._cate_info = {}
        answer = row['answerB']
    elif row['correctAnswer'] == 'C':
        answer = row['answerC']
    else:
        answer = row['answerD']
    if answer not in AnswerSet:
        AnswerSet.append(answer)

for row in DictReader(open('sci_test.csv')):
    for choice in ['answerA', 'answerB', 'answerC', 'answerD']:
        if row[choice] not in AnswerSet:
            AnswerSet.append(row[choice])

lang = 'simple'
wiki = Wikipedia(lang)
counts = 0
n_answer = 0
o = DictWriter(open("wiki.csv", 'wb'), ["answer", "question"])
o.writeheader()
counts = 0
for answer in AnswerSet:
    print n_answer
    n_answer += 1
    try:
        raw = wiki.article(answer)
    except:
        raw = None
class WikiCategory(object):
    """ the class used to find categories for entities """

    def __init__(self):
        self.wikipedia_caller = Wikipedia()
        self.wikidata_caller = Wikidata()
        self._cate_info = {}

    def _get_single_entity_cate(self, entity):
        """ find the category info for a single entity

        Updates
        -----------------------------------------
        cate_info: a dict
            categories of the entities
            key: entity name
            value: a list of categories of the key
        """
        if entity in self._cate_info:
            return
        else:
            self._cate_info[entity] = []
        try:
            entity_name = self.wikipedia_caller.get_entity_name(entity)
        except wikiexceptions.ResultErrorException:
            print "cannot find an entity for name %s" % entity
            self._cate_info[entity] = None
            return  # no entity found, so there is no name to look up in Wikidata
        try:
            entity_info = self.wikidata_caller.get_entity_info_by_name(entity_name)
        except wikiexceptions.NoClassException:
            print "entity %s has no class info" % entity_name
            self._cate_info[entity] = None
        else:
            for cid in entity_info['class_info']:
                self._cate_info[entity].append(entity_info['class_info'][cid])

    def _get_cate_for_entity_iterable(self, entity_iterable):
        """ find the categories info for an Iterable of entities

        Updates
        -----------------------------------------
        cate_info: a dict
            categories of the entities
            key: entity name
            value: a list of categories of the key
        """
        for entity in entity_iterable:
            self._get_single_entity_cate(entity)
            time.sleep(5)

    def get_cates(self, entitiy_input):
        """ Find the categories info for entities.
        Does a type check first; supports both string and Iterable (except dict) input.

        Updates:
        -----------------------------------------
        cate_info: a dict
            categories of the entities
            key: entity name
            value: a list of categories of the key
        """
        input_type = type(entitiy_input)
        if isinstance(entitiy_input, Iterable):
            if type(entitiy_input) == str:
                self._get_single_entity_cate(entitiy_input)
            elif input_type == dict:
                raise TypeError("unsupported type %s" % (input_type))
            else:
                self._get_cate_for_entity_iterable(entitiy_input)
        else:
            raise TypeError("unsupported type %s" % (input_type))

    @property
    def cate_info(self):
        return self._cate_info
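# A minimal usage sketch for WikiCategory above (entity names reuse article titles from
# elsewhere in this collection and are only illustrative; it assumes the Wikipedia/Wikidata
# wrapper classes and the wikiexceptions module used above are importable).
wiki_cate = WikiCategory()
wiki_cate.get_cates("Uruguay")             # a single entity name as a string
wiki_cate.get_cates(["Arizona", "iOS"])    # or any non-dict Iterable of names
print wiki_cate.cate_info                  # {entity: [categories] or None}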
from wikipedia import isQuestion, Wikipedia

browser1 = Browser()
speaker1 = Speaker()
userCommand = ""
while (True):
    speaker1.printAndSpeak("How may I help you, sir?")
    # userCommand = handleSpeech(userCommand)
    userCommand = input("Enter your command : ")
    print("Your command :", userCommand)
    if ("bye" in userCommand.lower()):
        speaker1.speak("Bye sir, see you next time")
        break
    elif (userCommand == ""):
        speaker1.printAndSpeak("Sorry sir, couldn't understand audio")
    elif (userCommand.lower() == "how are you doing"):
        speaker1.speak(
            random.choice(
                ["I am fine", "Incredible, Sir", "I am feeling great"]))
    elif ("open" in userCommand.lower()):
        browser1.open(userCommand)
    elif ("weather" in userCommand.lower()):
        displayWeather(userCommand)
    elif (isQuestion(userCommand)):
        wiki = Wikipedia(userCommand)
        wiki.findKeyword()
        if (wiki.doesPageExist()):
            wiki.displaySummary()
        else:
            speaker1.printAndSpeak("Sorry sir, couldn't get information ")
import sys

from wikipedia import Wikipedia
from wiki2plain import Wiki2Plain

lang = 'en'
wiki = Wikipedia(lang)
try:
    articleName = str(sys.argv[1])  # 'Uruguay'
    raw = wiki.article(articleName)
except:
    raw = None
if raw:
    wiki2plain = Wiki2Plain(raw)
    f = open('../corpus/' + articleName + '.txt', 'w')
    f.write(wiki2plain.text)
    # content = wiki2plain.text
    # print(wiki2plain.text)
#!/usr/bin/python
import sys
import os
import traceback

from wikipedia import Wikipedia

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print "usage : python main.py <input-file-name>"
        sys.exit(1)
    input_filename = sys.argv[1]
    cwd = os.getcwd()
    configuration_file = os.path.join(cwd, "config", "app_config.json")
    if not os.path.isfile(configuration_file):
        print "cannot access file ", configuration_file
        sys.exit(1)
    try:
        wiki = Wikipedia(input_filename, configuration_file)
        wiki.parse_input()
        wiki.calculate_and_print_answers()
    except Exception as e:
        print e
        #traceback.print_exc(file=sys.stdout)
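# Example invocation of the script above (the input file path is illustrative, reusing
# 'tests/test.input8' from the tests earlier in this collection; 'config/app_config.json'
# must exist under the working directory, as checked in the script):
#   python main.py tests/test.input8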