def wikipediaSearch( self, word = "iOS", lang = 'simple', maximumNumberOfResults = 1, save = False ):
     '''
         (get) Wikipedia (corpus (documents) ) (by) Search

         Searches Wikipedia for 'word' and appends the plain text of every
         article found to 'self.documents'.

         word -- search term passed to Wikipedia.search.
         lang -- language code used to build the Wikipedia client.
         maximumNumberOfResults -- third argument forwarded to
             Wikipedia.search (presumably the result limit; confirm against
             the Wikipedia class).
         save -- when true, each article's text is also written to
             '<title>.txt' in the current working directory.

         Articles whose download raises, or that come back empty, are skipped.
         '''
     wiki = Wikipedia(lang)
     
     resultadosdebusqueda = wiki.search(word , 1, maximumNumberOfResults)
     
     # Not used in the visible part of the loop; kept because the rest of the
     # method (progress reporting) may rely on it.
     numerodearticulos = len(resultadosdebusqueda)
     
     for resultado in resultadosdebusqueda:
         try:
             raw = wiki.article(resultado['title'])
         except Exception:
             # Best effort: one failing article must not abort the whole
             # search, but KeyboardInterrupt/SystemExit still propagate.
             raw = None
         if raw:
             wiki2plain = Wiki2Plain(raw)
             content = wiki2plain.text
             if save:
                 # 'with' closes the file even when write() raises.
                 with open( resultado['title'] + '.txt', 'w+') as f:
                     f.write(content)
             self.documents.append(content)
         #os.system('clear')
         
         # Print the method's progress.
         '''
Exemple #2
0
class WikipediaSearch(object):
    """Attach Wikipedia search results to the program documents stored in
    MongoDB and download the selected articles to disk."""

    def __init__(self,mlDb='ml',maxResult=10, overwrite=False,targetDir='./'):
        """
            Init method,
            mlDb : mongodb database
            maxResult : amount of result that search for each program, default 10
            overwrite : boolean parameter that overwrite if program have already wikipedia results, default False
            targetDir: directory where save the wikipedia articles downloaded
        """
        conn = Connection()
        db = conn[mlDb]
        self.programs = db.programs
        lang = 'en'
        self.wiki = Wikipedia(lang)
        self.maxResult = maxResult
        self.overwrite = overwrite
        self.targetDir = targetDir
        # Lazy %-args: same message, formatted only if the record is emitted.
        logging.info('Mongodb initialized in %s db for MovieLens', mlDb)


    def fill(self,maxProgramNumber=10):
        """Store Wikipedia search results on program documents.

        For at most maxProgramNumber programs, search Wikipedia for the
        program's name and save the result list under 'wikipediaResults' and
        the title of the first result under 'wikipediaSelected'.  Programs
        that already have a selection are skipped unless self.overwrite is set.
        """
        wikipediaResultsField = 'wikipediaResults'
        wikipediaSelectedField = 'wikipediaSelected'
        for program in self.programs.find().limit(maxProgramNumber):
            if not self.overwrite and wikipediaSelectedField in program:
                continue

            results = self.wiki.search2(program['name'].encode('utf-8'),self.maxResult)
            print(results)

            # Nothing found (empty or missing result): leave the program untouched.
            if not results:
                continue

            selected = results[0]['title']
            self.programs.update({'_id':program['_id']}, {"$set": {wikipediaResultsField:results, wikipediaSelectedField:selected}}, upsert=False)


    def downloadArticles(self,maxProgramNumber=10):
        """Download the selected Wikipedia article of each program.

        For at most maxProgramNumber programs that have a 'wikipediaSelected'
        title, fetch the raw article and write it to
        <targetDir>/<title without spaces, '/' and ':'>.txt.
        """
        # Local import: the module's import block is outside this class.
        import os

        print('running downloadArticles')
        wikipediaSelectedField = 'wikipediaSelected'
        for program in self.programs.find().limit(maxProgramNumber):
            if wikipediaSelectedField not in program:
                continue

            title = program[wikipediaSelectedField].encode('utf-8')
            filename = title.replace(" ", "").replace("/", "").replace(":", "") + ".txt"
            path = os.path.join(self.targetDir, filename)

            # Fetch before opening the file so a failed download does not
            # leave an empty file (and an unclosed handle) behind.
            rawArticle = self.wiki.article(title)
            if rawArticle is None:
                logging.warning('No article returned for %s, skipping', title)
                continue

            print('writing: ' + path)
            with open(path, 'w') as f:
                f.write(rawArticle)
Exemple #3
0
def wiki_extract(article, lang='pt'):
    """Fetch a Wikipedia article and split its plain text into sections.

    Args:
        article: article title passed to ``Wikipedia.article``.
        lang: language code used to build the Wikipedia client.

    Returns:
        A dict with a ``"resumo"`` key holding the lead text, plus one key per
        ``==`` heading mapping to ``{"text": ...}``.  ``===`` headings are
        nested as further ``{"text": ...}`` dicts inside the most recent
        ``==`` section (or at the top level when none has been seen yet).
        Every text line is wrapped in ``<p>...</p>``.  If the article cannot
        be fetched the result is ``{"resumo": ""}``.
    """
    wiki = Wikipedia(lang)
    try:
        raw = wiki.article(article)
    except Exception:
        # Best effort: a failed download yields an empty result.
        raw = None

    content = Wiki2Plain(raw).text if raw else ""

    content_dict = {"resumo": ""}
    current_pointer = content_dict
    parent_pointer = content_dict
    first = True  # True until the first heading is seen
    for line in content.splitlines():
        line = line.strip()
        if not line:
            continue

        if line.startswith("==="):
            title = __get_title(line)
            parent_pointer[title] = {"text": ""}
            current_pointer = parent_pointer[title]
            # A subsection may appear before any "==" section; from here on
            # text belongs to it.  Without this the next text line looked up
            # "resumo" on the subsection dict and raised KeyError.
            first = False
        elif line.startswith("=="):
            title = __get_title(line)
            content_dict[title] = {"text": ""}
            parent_pointer = content_dict[title]
            current_pointer = content_dict[title]
            first = False
        elif first:
            content_dict["resumo"] = "%s<p>%s</p>" % (
                content_dict["resumo"], line)
        else:
            current_pointer["text"] = "%s<p>%s</p>" % (
                current_pointer["text"], line)

    return content_dict
def wiki_extract(article, lang='pt'):
    """Download a Wikipedia article and organise its plain text by heading.

    Args:
        article: article title passed to ``Wikipedia.article``.
        lang: language code used to build the Wikipedia client.

    Returns:
        A dict whose ``"resumo"`` entry is the text found before the first
        heading.  Each ``==`` heading adds a ``{"text": ...}`` entry keyed by
        its title, and each ``===`` heading adds a nested ``{"text": ...}``
        entry under the latest ``==`` section (top level if there is none).
        Text lines are wrapped in ``<p>...</p>``.  A failed or empty download
        returns ``{"resumo": ""}``.
    """
    wiki = Wikipedia(lang)
    try:
        raw = wiki.article(article)
    except Exception:
        # Deliberate best effort: treat any download error as "no article".
        raw = None

    content = ""
    if raw:
        content = Wiki2Plain(raw).text

    content_dict = {"resumo": ""}
    current_pointer = content_dict
    parent_pointer = content_dict
    first = True  # still collecting the lead text
    for line in content.splitlines():
        line = line.strip()
        if line == "":
            continue

        is_subsection = line.startswith("===")
        if line.startswith("==") and not is_subsection:
            title = __get_title(line)
            content_dict[title] = {"text": ""}
            parent_pointer = content_dict[title]
            current_pointer = content_dict[title]
            first = False
        elif is_subsection:
            title = __get_title(line)
            parent_pointer[title] = {"text": ""}
            current_pointer = parent_pointer[title]
            # Fix: a "===" heading ahead of any "==" heading used to leave
            # `first` set, so the next text line raised KeyError("resumo").
            first = False
        elif first:
            content_dict["resumo"] = "%s<p>%s</p>" % (content_dict["resumo"], line)
        else:
            current_pointer["text"] = "%s<p>%s</p>" % (current_pointer["text"], line)

    return content_dict
import sys
from wikipedia import Wikipedia
from wiki2plain import Wiki2Plain

# Download the article named by the first command-line argument and save its
# plain text to ../corpus/<name>.txt.  Nothing is written when the argument is
# missing or the download fails.
lang = 'en'
wiki = Wikipedia(lang)

# e.g. 'Uruguay'
articleName = str(sys.argv[1]) if len(sys.argv) > 1 else None

raw = None
if articleName is not None:
    try:
        raw = wiki.article(articleName)
    except Exception:
        # Best effort: a failed download simply produces no file.
        raw = None

if raw:
    wiki2plain = Wiki2Plain(raw)
    # 'with' closes the file; the original left the handle open.
    with open('../corpus/' + articleName + '.txt', 'w') as f:
        f.write(wiki2plain.text)

# For every answer in AnswerSet, fetch its Simple English Wikipedia article
# and write the first line of the plain text as the "question" column of
# wiki.csv.  Answers whose article cannot be fetched are skipped.
lang = 'simple'
wiki = Wikipedia(lang)

counts = 0
n_answer = 0

# 'with' makes sure the CSV is flushed and closed even if the loop fails.
with open("wiki.csv", 'wb') as csv_file:
    o = DictWriter(csv_file, ["answer",  "question"])
    o.writeheader()

    for answer in AnswerSet:
        # Progress output: index of the answer about to be processed.
        print(n_answer)
        n_answer += 1
        try:
            raw = wiki.article(answer)
        except Exception:
            # Best effort: skip answers whose article cannot be downloaded.
            raw = None

        if raw:
            question = Wiki2Plain(raw).text.split('\n')[0]
            d = {'answer': answer, 'question': question}
            o.writerow(d)

        # Progress output: index of the answer just processed.
        print(counts)
        counts += 1


# counts = 0
# total = 0
# for row in DictReader(open('sci_train.csv')):
Exemple #7
0
from wikipedia import Wikipedia
from Wiki2Plain import Wiki2Plain

if __name__ == '__main__':
    # Demo: fetch the 'Uruguay' article from Simple English Wikipedia, print
    # the raw markup, then print its plain-text conversion.

    lang = 'simple'
    wiki = Wikipedia(lang)

    try:
        raw = wiki.article('Uruguay')
        print(raw)
    except Exception:
        # Best effort: on any failure there is nothing to convert.
        raw = None

    if raw:
        # Decode only real byte strings; bytes(<str>) raises TypeError on
        # Python 3, and text that is already decoded needs no conversion.
        text = raw.decode("utf-8") if isinstance(raw, bytes) else raw
        wiki2plain = Wiki2Plain(text)
        content = wiki2plain.text
        print(content)
Exemple #8
0
from wikipedia import Wikipedia
from wiki2plain import Wiki2Plain
import io

# Fetch the 'Arizona' article, dump its plain text to per.txt, then copy to
# per1.txt the first two lines read from per.txt that contain none of
# '{', '|' or '}' (an empty read at end of file counts as such a line).

# Start from an empty string so a failed download writes empty files instead
# of raising TypeError on `"" + content`.
content = ""
lang = 'simple'
wiki = Wikipedia(lang)

try:
    raw = wiki.article('Arizona')
except Exception:
    # Best effort: carry on with empty content.
    raw = None

if raw:
    wiki2plain = Wiki2Plain(raw)
    content = wiki2plain.text

print(content)
with io.open("per.txt", "wb") as model_file:
    model_file.write("" + content)

i = 0
# Both files are closed on exit; the original never closed per1.txt.
with io.open("per1.txt", "wb") as model_file1, open("per.txt", "r") as f:
    while i < 2:
        line = f.readline()
        if "{" in line or "|" in line or "}" in line:
            # Skipped markup line: the original prints a blank line here.
            print("")
        else:
            i += 1
            model_file1.write("" + line)