Example no. 1
    def search_wiki(self, k):  # scrape query's wikipedia article
        article = Wikipedia().search(k)
        if article is None:
            try:
                ny = wikipedia.page(k)
            except requests.exceptions.ConnectionError as e:
                return 'ReStArT'


#            except wikipedia.exceptions.HTTPTimeoutError as e:
#                print('here1')
#                print(str(e))
#            except requests.exceptions.ConnectTimeout as e:
#                print('here2')
#                print(str(e))
            except wikipedia.exceptions.DisambiguationError as e:
                try:
                    ny = wikipedia.page(e.options[0])
                except requests.exceptions.ConnectionError as e:
                    return 'ReStArT'
                except wikipedia.exceptions.DisambiguationError as e:
                    return -5
            except wikipedia.exceptions.PageError as e:
                return -5
            article = Wikipedia().search(ny.title)

        contents = [section.content for section in article.sections]
        kk = 0
        for content in contents:
            if (len(content) == 0):
                kk = kk + 1
        if not contents or kk == len(contents):
            try:
                ny = wikipedia.page(k)
            except requests.exceptions.ConnectionError as e:
                return 'ReStArT'
            except wikipedia.exceptions.DisambiguationError as e:
                try:
                    ny = wikipedia.page(e.options[0])
                except requests.exceptions.ConnectionError as e:
                    return 'ReStArT'
                except wikipedia.exceptions.DisambiguationError as e:
                    return -5
            content = ny.content
            if (len(content) == 0):
                return -5
            else:
                contents = []
                contents.append(content)
        d = []
        for content in contents:
            a = content.split()
            d.append(a)
        content = [
            j for i in d for j in i
            if re.match("^[a-zA-Z_-]*$", j) and len(j) > 1
        ]  # take only meaningful content
        self.content = ' '.join(content)
        return self.content
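The method above signals failure in band: it returns the string 'ReStArT' when the connection drops and -5 when a page cannot be resolved. Below is a minimal sketch of how a caller might react to those sentinels; fetch_with_retry and its retry limit are hypothetical and not part of the original code.

import time

def fetch_with_retry(bot, query, max_retries=3):
    # bot is assumed to expose the search_wiki() method shown above.
    for _ in range(max_retries):
        result = bot.search_wiki(query)
        if result == 'ReStArT':  # connection error: wait and try again
            time.sleep(5)
            continue
        if result == -5:  # disambiguation or missing page: give up
            return None
        return result  # the cleaned plain-text content
    return None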
Example no. 2
 def get_articles(self, start, depth=1, max_count=1):
     article = Wikipedia().article(start)
     links = article.links
     list_of_strings = []
     for link in links:
         text = Wikipedia().article(link)
         text = self.text_cleaning(plaintext(text.source))
         list_of_strings.append(text)
     return list_of_strings
Example no. 3
    def get_articles(self, start):

        article = Wikipedia().article(start)
        links = article.links
        list_of_strings = []
        for l in links:
            l_art = Wikipedia().article(l)
            l_text = re.sub(' +', ' ', plaintext(l_art.source), flags=re.DOTALL)
            l_text = re.sub('[.;:!?\'"()0-9@,-]+', '', l_text, flags=re.DOTALL).lower()
            list_of_strings.append(l_text.split())

        return list_of_strings
Example no. 4
def testWikipedia(palabra):
    """
    Obtiene el articulo de la wikipedia
    """
    engine = Wikipedia(license=None, throttle=5.0)
    resultados = engine.search(palabra)

    print resultados.plaintext()
    return resultados
Example no. 5
 def get_articles(self, start):
     arr = []
     wiki = Wikipedia(language="en")
     article = wiki.article(start)
     arr.append(self.text_cleaning(article))
     for title in article.links:
         article = wiki.article(title)
         arr.append(self.text_cleaning(article))
     return arr
Example no. 6
def getFeaturedList():
    wiki = Wikipedia()
    article = wiki.search("Wikipedia:Featured articles")
    file = open("articalsTitle.txt", 'w')
    for section in article.sections:
        if section.string != "":
            for title in section.string.split('\n'):
                file.write(str(title).strip() + "\n")
    file.close()
Example no. 7
    def get_articles(self, start):
        list_of_strings = []
        article = Wikipedia().article(start).plaintext()
        list_of_strings.append(self.plain_text(article))

        #for link in Wikipedia().article(start).links:
        #    if Wikipedia().article(link).language is 'en':
        #        list_of_strings.append(self.plain_text(link))

        return list_of_strings
Example no. 8
 def get_articles(self, start):
     from pattern.web import Wikipedia, plaintext
     file = open(start+'.txt', 'w')
     file.write('')
     file.close()
     list_of_strings = []
     start_article  = Wikipedia().article(start)
     links = set(start_article.links)  # de-duplicate the linked article titles
     for link in links:
         cur_article = Wikipedia().article(link)
         if cur_article is not None:
             newtext = plaintext(cur_article.source)
             newtext = self.normalize(newtext)
             list_of_strings.append(newtext)
             file = open(start+'.txt', 'a')
             file.write(newtext.encode('utf-8'))
             file.write('\n\n-----\n\n')
             file.close()
     return list_of_strings
Example no. 9
 def show_results(self):
     parser = WikiParser()
     articles = parser.get_articles('Natural language processing')
     art_statistics = TextStatistics(articles)
     print('top-20 3-grams over the article corpus\r\n' + art_statistics.get_top_3grams(20))
     print('top-20 words over the article corpus\r\n' + art_statistics.get_top_words(20))

     article = Wikipedia().article('Natural language processing')
     a_statistics = TextStatistics(article)
     print('top-5 3-grams for the article itself\r\n' + a_statistics.get_top_3grams(5))
     print('top-5 words for the article itself\r\n' + a_statistics.get_top_words(5))
Example no. 10
def wiki(titulo):
    article = Wikipedia(language="en").search(query=titulo)
    result=[]
    if article:
        for section in article.sections:
            result.append(section.title+"\n"+section.string+"\n")
            for link in section.links:
                result.append(link+"\n")

    return result
Example no. 11
def getFeaturedContent():
    wiki = Wikipedia()
    list = open("articalsTitle.txt", 'r')
    file = open("wikiData.txt", 'w')
    for i in range(2000):
        title = list.readline().replace("\n", "")
        article = wiki.search(title)
        if article is not None:
            for section in article.sections:
                if section.string != "":
                    file.write(section.string + "\n")
            time.sleep(0.2)
            print(title + " Get! " + str(i) + "/2000")
Example no. 12
def celebrities():
    """ Returns a list of celebrities from Wikipedia.
    """
    celebrities = set()
    w = Wikipedia(language="en")
    p = w.search("List of people on the cover of Rolling Stone", cached=True)
    s = p.plaintext()
    for section in p.sections:
        if section.parent and section.parent.title in ("Individuals", ):
            for name in section.links:
                if "* " + name in s:
                    celebrities.add(name)
    return celebrities
Example no. 13
    def get_articles(self, search_term):
        text_list = []

        article = Wikipedia().search(search_term)
        links = article.links

        s = self.__get_text(search_term)
        text_list.append(s)

        for link in links:
            text_list.append(self.__get_text(link))
            print(link)

        return text_list
Example no. 14
def gender(name):
    """ Returns the gender of the given person (m/f).
    """
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        s = plaintext(p.string)
        s = s.lower()
        s = s.replace("\n", "\n ")
        m = sum(s.count(" %s " % x) for x in ("he", "his"))
        f = sum(s.count(" %s " % y) for y in ("she", "her"))
        g = m > f and "m" or "f"  # More "he" or more "she"?
        return g
    except:
        return None
Example no. 15
 def search_wiki(self, k):  # scrape query's wikipedia article
     article = Wikipedia().search(k)
     contents = [
         section.content.encode("utf8") for section in article.sections
     ]
     d = []
     for content in contents:
         a = content.split()
         d.append(a)
     content = [
         j for i in d for j in i
         if re.match("^[a-zA-Z_-]*$", j) and len(j) > 1
     ]  # take only meaningful content
     self.content = ' '.join(content)
     return self.content
Example no. 16
def get_info(search_query):
	if isinstance(search_query, str):
		search_query = str(search_query)
	else:
		return { "Error": "Pass a string, from mine.py [7]", "Result": [None] }

	result = []
	engineGoogle = Google(license=None, throttle=0.5, language=None)
	engineBing = Bing(license=None, throttle=0.5, language=None)
	engineTwitter = Twitter(license=None, throttle=0.5, language=None)
	engineFacebook = Facebook(license=None, throttle=1.0, language='en')
	engineWikipedia = Wikipedia(license=None, throttle=5.0, language=None)
	engineFlickr = Flickr(license=None, throttle=5.0, language=None)
	engineArray = [engineGoogle, engineBing, engineTwitter, engineFacebook, engineWikipedia, engineFlickr]
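	# NOTE: this reassignment overrides the full engine list above; the active code below queries engineGoogle and engineTwitter directly and never uses engineArray.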
	engineArray = [engineGoogle, engineTwitter]

	'''
	for i in range(1,2):
		# result = result + ([repr(plaintext(para.text)) for para in engine[0].search(search_query, type=SEARCH, start=i, count=5)])
		[result.append([result.append(repr(plaintext(para.text))) for para in engine.search(search_query, type=SEARCH, start=i, count=5)]) for engine in engineArray]
			# print repr(plaintext(para.text))
			# print repr(plaintext(para.url)) + '\n\n'
			# result.append(repr(plaintext(para.text)))
	'''

	# Google
	for i in range(1, 5):
		result = result + ([para.text for para in engineGoogle.search(search_query, type=SEARCH, start=i, count=10)])
		
	for i in range(1, 5):
		result = result + ([para.text for para in engineTwitter.search(search_query, type=SEARCH, start=i, count=10)])
	'''
	# for i in range(1,2):
		# result = result + ([repr(plaintext(para.text)) for para in engineBing.search(search_query, type=SEARCH, start=i, count=5)])
	for i in range(1,2):
		result = result + ([repr(plaintext(para.text)) for para in engineTwitter.search(search_query, type=SEARCH, start=i, count=10)])
	# for i in range(1,2):
		# result = result + ([repr(plaintext(para.text)) for para in engineFacebook.search(search_query, type=SEARCH, start=i, count=5)])
	# for i in range(1,2):
		# result = result + ([repr(plaintext(para.text)) for para in engineWikipedia.search(search_query, type=SEARCH, start=i, count=5)])
	# for i in range(1,2):
		# result = result + ([repr(plaintext(para.text)) for para in engineFlickr.search(search_query, type=SEARCH, start=i, count=5)])
	'''

	return { "Error": None, "Result": result }

	# return { "Error": None, "Result": ['Hello World', 'Bye Bye Tommy'] }
Example no. 17
def crawl_wiki():
    engine = Wikipedia(license=None, throttle=1.0, language='en')
    wikis = {}
    keywords = get_keywords()
    for keyword in keywords:
        stop = False
        while stop is False:
            try:
                article = engine.search(query=keyword)
            except Exception as e:
                print str(e)
                article = None

            if type(article) is pattern.web.WikipediaArticle:
                if article.disambiguation is False:
                    print '\nretrieving', keyword, '...',
                    wikis[keyword] = {}
                    wikis[keyword]['keyword'] = keyword
                    wikis[keyword]['text'] = article.plaintext()
                    stop = True
                else:
                    print '\n[', keyword, '] leads to disambiguation page!',
                    stop = True

                    if '-' in keyword:
                        keyword = re.sub(
                            '-', ' ',
                            keyword)  # convert hyphen into white space
                        stop = False
                    if keyword.islower() and len(keyword) <= 5:
                        keyword = keyword.upper()
                        stop = False
            else:
                print '\n[', keyword, '] doesn\'t exist on wikipedia!',
                stop = True

                if '-' in keyword:
                    keyword = re.sub(
                        '-', ' ', keyword)  # convert hyphen into white space
                    stop = False
                if keyword.islower() and len(keyword) <= 5:
                    keyword = keyword.upper()
                    stop = False

    enpickle(wikis, 'data/txt/wiki.pkl')
    return wikis
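The keyword rewriting rules above (hyphen to space; short lowercase terms retried in upper case) are duplicated in both failure branches. The following hypothetical helper captures the same rules, shown only as a sketch:

import re

def keyword_fallbacks(keyword):
    """Yield alternative spellings of a keyword to retry against Wikipedia."""
    if '-' in keyword:
        yield re.sub('-', ' ', keyword)  # convert hyphen into white space
    if keyword.islower() and len(keyword) <= 5:
        yield keyword.upper()  # a short lowercase term may be an acronym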
Example no. 18
def isnews(topic):
    engine = Wikipedia()
    result = engine.search(topic)
    if result:
        if topic.lower() not in result.title.lower():
            return False
        newsthings = [
            'places', 'cities', 'capitals', 'countries', 'people', 'wars'
        ]
        categories = result.categories
        for category in categories:
            for thing in newsthings:
                if thing in category.lower():
                    return True
        return False
    else:
        return False
Example no. 19
def celebrities():
    """ Returns a list of celebrities from Wikipedia.
    """
    celebrities = set()
    w = Wikipedia(language="en")
    p = w.search("List of people on the cover of Rolling Stone", cached=True)
    s = p.plaintext()
    # Extract the links from this page, excluding links in the footnotes section,
    # or links to band names (we just want names of individuals).
    for section in p.sections:
        if section.parent and section.parent.title in ("Individuals", ):
            for name in section.links:
                # Yes = * [Rob Zombie], musician and filmmaker
                # No  = * Mark White, bass guitarist for [Spin Doctors]
                if "* " + name in s:
                    celebrities.add(name)
    return celebrities
Example no. 20
def age(name):
    """ Returns the age of the given person.
    """
    # Use regular expression to try and parse
    # a number of date formats from Wikipedia.
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        t = DOM(p.src)
        s = plaintext(p.string)
        s = re.sub(r"\[[0-9]+\]", "", s)
        r = r"\(born ([0-9]+ [A-Za-z]+ )?([0-9]{4})\)"  # (born 1 December 2000)
        x = t(".bday")
        y = t(".dday")
        x = x[0].content if x else re.search(r, s).group(2)
        y = y[0].content if y else str(date().year)
        x = plaintext(x)
        y = plaintext(y)
        x = x.split("-")[0]  # YYYY-MM-DD
        y = y.split("-")[0]
        a = int(y) - int(x)
        return a
    except:
        pass
    try:
        r = ur"[A-Za-z]+ [0-9]+, ([0-9]{4})"
        r = ur"\(%s – %s\)" % (r, r)  # (May 15, 1912 – October 7, 2003)
        x = re.search(r, s).group(1)
        y = re.search(r, s).group(2)
        a = int(y) - int(x)
        return a
    except:
        pass
    try:
        r = r"\(aged ([0-9]+)\)"  # (aged 78)
        a = t(".infobox td:contains('aged')")
        a = a[0].content
        a = plaintext(a)
        a = re.search(r, a).group(1)
        a = int(a)
        return a
    except:
        pass
    return None
Example no. 21
    def __get_text(self, search_term):

        article = Wikipedia().search(search_term)
        if article:
            s = article.string
            if s:
                while '\n' in s:
                    s = s.replace('\n', ' ')
                while '  ' in s:
                    s = s.replace('  ', ' ')

                s = re.sub(r'[^\w\s]', '', s)
                s = s.lower()
                return s

            else:
                return None

        else:
            return None
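The two while loops above collapse newlines and runs of spaces one replacement at a time. An equivalent single pass with a regular expression is sketched below (as an assumption, it additionally collapses tabs and other whitespace):

import re

def normalize_whitespace(s):
    # Replace any run of whitespace, including newlines, with a single space.
    return re.sub(r'\s+', ' ', s)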
Example no. 22
    def __init__(self):
        from pattern.web import Wikipedia, plaintext
        import string
        import re
        self.parser = WikiParser()
        self.articles = self.parser.get_articles('Natural_language_processing')
        self.stats = TextStatistics(self.articles)
        self.grams = self.stats.get_top_3grams()
        self.words = self.stats.get_top_words()
        self.top_grams_out = [' - '.join([self.grams[0][i], str(self.grams[1][i])]) for i in range(20)]
        self.top_grams = self.grams[0][:20]
        self.top_words_out = [' - '.join([self.words[0][i], str(self.words[1][i])]) for i in range(20)]
        self.top_words = self.words[0][:20]

        nlp = Wikipedia().article('Natural_language_processing')
        nlptext = plaintext(nlp.source)
        parse = WikiParser()
        nlptext = parse.normalize(nlptext)
        self.nlp_stats = TextStatistics([nlptext])
        self.nlp_top_grams = [' - '.join([self.nlp_stats.get_top_3grams()[0][i], str(self.nlp_stats.get_top_3grams()[1][i])]) for i in range(20)][:5]
        self.nlp_top_words = [' - '.join([self.nlp_stats.get_top_words()[0][i], str(self.nlp_stats.get_top_words()[1][i])]) for i in range(20)][:5]
Example no. 23
    def get_wiki_article(self,
                         search,
                         separate_in_section=False,
                         type_of_text=u'Plain text'):
        segments = list()
        article = Wikipedia(language=self.dico_lang[self.language]).search(
            search, cached=False)
        if article:
            if separate_in_section:
                for section in article.sections:
                    if type_of_text == u'Plain text':
                        wiki_article = Input(section.string)
                    else:
                        wiki_article = Input(section.html)

                    annotations = {
                        'source': 'Wikipedia',
                        'section title': section.title,
                        'section level': section.level,
                        'search': search,
                    }
                    segment = wiki_article[0]
                    segment.annotations.update(annotations)
                    wiki_article[0] = segment
                    segments.append(wiki_article)
            else:
                if type_of_text == u'Plain text':
                    wiki_article = Input(article.string)
                else:
                    wiki_article = Input(article.html)
                annotations = {
                    'source': 'Wikipedia',
                    'search': search,
                }
                segment = wiki_article[0]
                segment.annotations.update(annotations)
                wiki_article[0] = segment
                segments.append(wiki_article)
        return segments
Example no. 24
def Avg(s):
    ''' The Averaging Function
        Takes in a string, searches for the Wiki article
        and then returns the average length of words.
        No, there is no error handling.
    '''
    length = 0
    numb = 0

    # Return the cached average if this article has been processed before.
    if s in db:
        return float(db[s])
    article = Wikipedia().search(s)
    for i in xrange(len(article.sections)):
        if article.sections[i].title == "See also":
            break
        text = article.sections[i].content.replace('\n', ' ')
        for word in text.split(' '):
            numb += 1
            length += len(word)
    db[s] = str(float(length) / float(numb))
    return float(length) / float(numb)
Example no. 25
"""
Retrieves an article from Wikipedia (http://en.wikipedia.org).
"""
from pattern.web import Wikipedia

ENGINE = Wikipedia(language="en")


def main(search_query):
    """ Returns Twitter Search Results
    :param search_query: (str)
    """
    final = "Wikipedia Search Results:"
    article = ENGINE.search(search_query, cached=True, timeout=30)

    print article.title  # Article title (may differ from the search query).
    print ""
    print article.languages[
        "tr"]  # Article in Turkish, can be retrieved with Wikipedia(language="tr").
    print article.links[:10]  # List of linked Wikipedia articles.
    print article.external[:5]  # List of external URL's.
    print ""

    for s in article.sections:
        final = final + "\n\n" + s.title.upper() + "\n" + s.content

    return final


if __name__ == '__main__':
    main("")
Example no. 26
from __future__ import print_function
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Wikipedia

# This example retrieves an article from Wikipedia (http://en.wikipedia.org).
# Wikipedia queries request the article HTML source from the server. This can be slow.
# It is a good idea to cache results from Wikipedia locally,
# and to set a high timeout when calling Wikipedia.search().

engine = Wikipedia(language="en")

# Contrary to the other search engines in the pattern.web module,
# Wikipedia simply returns one WikipediaArticle object (or None),
# instead of a list of results.
article = engine.search("alice in wonderland", cached=True, timeout=30)

# Article title (may differ from the search query).
print(article.title)
print()
# Article in French, can be retrieved with Wikipedia(language="fr").
print(article.languages["fr"])
print(article.links[:10], "...")  # List of linked Wikipedia articles.
print(article.external[:5], "...")  # List of external URL's.
print()

# print article.source # The full article content as HTML.
# print article.string # The full article content, plain text with HTML
# tags stripped.
Example no. 27
# Word frequency.

from pattern.web import Wikipedia
from pattern.vector import words

frequency = {}

# Spreading activation.
# Parse links from seed article & visit those articles.

links, seen = set(["Italia"]), {}

while len(links) > 0:

    try:
        article = Wikipedia(language="it").search(links.pop(), throttle=10)
        seen[article.title] = True

        # Parse links from article.

        for link in article.links:

            if link not in seen:
                links.add(link)

        # Parse words from article. Count words.

        for word in words(article.string):

            if word not in frequency:
                frequency[word] = 0
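            frequency[word] += 1

    # NOTE: the original example is truncated above. The lines below are a hedged
    # sketch of one possible ending (close the try block and report the counts);
    # they are not part of the original code.
    except Exception:
        # Skip articles that fail to download and keep crawling.
        pass

    if len(seen) >= 100:  # hypothetical stop condition so the crawl terminates
        break

# Report the ten most frequent words counted over the visited articles.
for w, n in sorted(frequency.items(), key=lambda kv: kv[1], reverse=True)[:10]:
    print("%s %d" % (w, n))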
Example no. 28
def main():
    article = Wikipedia(language="en").search('small', throttle=10)
    print article.string
Example no. 29
def descarga(titulo):
    engine = Wikipedia(language="en")
    result = engine.search(titulo, type=SEARCH)
    return repr(plaintext(result.string))
Example no. 30
from pattern.web import Wikipedia
import re
from string import punctuation
from pymorphy2 import MorphAnalyzer
import pandas as pd

safe = lambda x: 0 if x < 0 else x


wiki = Wikipedia(language="ru")
punc = re.compile('[%s]' % re.escape(punctuation))
start_points = ["Проектирование", "Номинативная_конструкция"]  # seed articles on the Russian Wikipedia ("Design", "Nominative construction")
lemma = "конструкция"  # target lemma: "construction"
translator = str.maketrans('', '', punctuation)
regex = re.compile("[%s0-9]" % (punctuation + "—"))
analyzer = MorphAnalyzer()
stoplinks = ["Википедия", "Английский"]  # skip links whose titles are "Wikipedia" / "English"
with open("stop-words-russian.txt") as io:
    stopwords = [x.strip() for x in io.readlines()]


def get_articles(max_count):
    sents = {0: set(), 1: set()}
    for i in range(len(start_points)):
        start = start_points[i]
        links = [start]
        list_of_strings = []
        while len(sents[i]) <= max_count:
            links_temp = []
            for link in links:
                if len(list_of_strings) <= max_count: