Code example #1
File: LSA.py Project: csl-622/distance_measure_work
    def search_wiki(self, k):  # scrape query's wikipedia article
        article = Wikipedia().search(k)
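        # If pattern's Wikipedia().search() returns nothing, fall back to the wikipedia package to resolve the title.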
        if article is None:
            try:
                ny = wikipedia.page(k)
            except requests.exceptions.ConnectionError as e:
                return 'ReStArT'


#            except wikipedia.exceptions.HTTPTimeoutError as e:
#                print('here1')
#                print(str(e))
#            except requests.exceptions.ConnectTimeout as e:
#                print('here2')
#                print(str(e))
            except wikipedia.exceptions.DisambiguationError as e:
                try:
                    ny = wikipedia.page(e.options[0])
                except requests.exceptions.ConnectionError as e:
                    return 'ReStArT'
                except wikipedia.exceptions.DisambiguationError as e:
                    return -5
            except wikipedia.exceptions.PageError as e:
                return -5
            article = Wikipedia().search(ny.title)

        contents = [section.content for section in article.sections]
        kk = 0
        for content in contents:
            if (len(content) == 0):
                kk = kk + 1
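        # If the article has no sections, or every section is empty, retry with the full page text from the wikipedia package.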
        if (contents == [] or kk == len(contents)):
            try:
                ny = wikipedia.page(k)
            except requests.exceptions.ConnectionError as e:
                return 'ReStArT'
            except wikipedia.exceptions.DisambiguationError as e:
                try:
                    ny = wikipedia.page(e.options[0])
                except requests.exceptions.ConnectionError as e:
                    return 'ReStArT'
                except wikipedia.exceptions.DisambiguationError as e:
                    return -5
            content = ny.content
            if (len(content) == 0):
                return -5
            else:
                contents = []
                contents.append(content)
        d = []
        for content in contents:
            a = content.split()
            d.append(a)
        content = [
            j for i in d for j in i
            if re.match("^[a-zA-Z_-]*$", j) and len(j) > 1
        ]  # take only meaningful content
        self.content = ' '.join(content)
        return self.content
Code example #2
File: Wikifier.py Project: fbkarsdorp/dbnl
    def extract_unique_nes(self, input_dir='../workspace/frog_periodicals', fresh=False,
                            max_documents=None, max_words_per_doc=None,
                            filename='../workspace/nes2wikilinks.p'):
        """
        Extracts all unique entities in the frogged files under input_dir as a dict.
        Registers in this dict: which relevant wiki-pages the NE could refer to
        according to the Wikipedia search interface.
        Only considers NEs that are:
            * capitalized
            * have len > 3 (cf. 'Van')
            * don't end in a dot (e.g. 'A.F.Th.')
            * tagged as B-PER by Frog
        """
        if fresh:
            print('Extracting NEs from ', max_documents, 'documents!')
            wikipedia = Wikipedia(language='nl', throttle=3)
            self.nes2wikilinks = {}

            for filepath in glob.glob(input_dir+'/*.txt.out'):
                max_words = max_words_per_doc
                for line in codecs.open(filepath, 'r', 'utf8'):
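                    # Each Frog output line is tab-separated: index, token, lemma, POS tag, confidence, NE tag.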
                    try:
                        comps = [c for c in line.strip().split('\t') if c]
                        idx, token, lemma, pos, conf, ne  = comps
                        token = token.replace('_', ' ')
                        if ne.startswith('B-PER') and token[0].isupper() and len(token) > 3 and not token.endswith('.'):
                            if token not in self.nes2wikilinks:
                                try: # to look up the page in wikipedia:
                                    article = wikipedia.search(token)
                                    if article: # if we find something...
                                        if article.categories[0] == 'Wikipedia:Doorverwijspagina': # we are dealing a referral page
                                            for link in article.links:
                                                if link in self.page_ids:
                                                    if token not in self.nes2wikilinks:
                                                        self.nes2wikilinks[token] = set()
                                                    self.nes2wikilinks[token].add(link)
                                        else:
                                            if article.title in self.page_ids:
                                                self.nes2wikilinks[token] = set([article.title])
                                except: # probably a download issue...
                                    continue
                        max_words -= 1
                        if max_words < 0:
                            break
                    except ValueError: # probably parsing error in the frog file
                        continue

                # update stats:
                max_documents -= 1
                if max_documents % 10 == 0:
                    print('\t+ ', max_documents, 'documents to go')
                    print('\t+ ', len(self.nes2wikilinks), 'NEs collected')
                if max_documents < 0:
                    break

            pickle.dump(self.nes2wikilinks, open(filename, 'wb'))

        else:
            self.nes2wikilinks = pickle.load(open(filename, 'rb'))
Code example #3
File: hw3.py Project: gradientex/HSE
 def get_articles(self, start, depth=1, max_count=1):
     article = Wikipedia().article(start)
     links = article.links
     list_of_strings = []
     for link in links:
         text = Wikipedia().article(link)
         text = self.text_cleaning(plaintext(text.source))
         list_of_strings.append(text)
     return list_of_strings
Code example #4
File: task_1.py Project: kartozia/Kartozia_4
 def get_articles(self, start):
     arr = []
     wiki = Wikipedia(language="en")
     article = wiki.article(start)
     arr.append(self.text_cleaning(article))
     for title in article.links:
         article = wiki.article(title)
         arr.append(self.text_cleaning(article))
     return arr
Code example #5
File: tests.py Project: anddonram/Painting4U
def testWikipedia(palabra):
    """
    Fetches the Wikipedia article for the given word.
    """
    engine = Wikipedia(license=None, throttle=5.0)
    resultados = engine.search(palabra)

    print resultados.plaintext()
    return resultados
Code example #6
def getFeaturedList():
    wiki = Wikipedia()
    article = wiki.search("Wikipedia:Featured articles")
    file = open("articalsTitle.txt", 'w')
    for section in article.sections:
        if section.string != "":
            for title in section.string.split('\n'):
                file.write(str(title).strip() + "\n")
    file.close()
Code example #7
File: Wikifier.py Project: fbkarsdorp/dbnl
    def mentions_from_backlinks(self, backlinks={}, fresh=False, filename='../workspace/mentions.p', context_window_size=150):
        """
        Mines backlinking pages for mentions of the page_ids in backlinks.
        Returns 5 tuples, with for each mention:
            * target_id (correct page title)
            * the name variant (inside the a-tag)
            * left context of the mention (contiguous character string, with len = context_window_size)
            * right context of the mention (contiguous character string, with len = context_window_size)
            * a counter of other page_ids mentioned on the page
        """
        if not backlinks:
            backlinks = self.backlinks

        # initialize data containers:
        target_ids, name_variants, left_contexts, right_contexts, page_counts = [], [], [], [], []

        if fresh:
            print('>>> Mining mentions from', sum([len(v) for k,v in backlinks.items()]),
                  'backlinking pages to', len(backlinks), 'target pages')
            print(backlinks)
            wikipedia = Wikipedia(language='nl', throttle=2)

            for idx, (page_id, links) in enumerate(backlinks.items()):
                print('\t + mining mentions of', page_id, '('+str(len(links)), 'backlinks) | page', idx+1, '/', len(backlinks))
                for backlink in links:
                    try:
                        article = wikipedia.search(backlink) # fetch the linking page via pattern
                        if not article.categories[0] == 'Wikipedia:Doorverwijspagina': # skip referral pages
                            print('\t\t* backlink:', backlink)
                            section_sources = [] # fetch the html-sections of individual sections:
                            if not article.sections: # article doesn't have sections
                                section_sources = [article.source]
                            else:
                                section_sources = [section.source for section in article.sections]
                            # loop over the section sources and extract all relevant mentions:
                            for section_source in section_sources:
                                ts, nvs, lcs, rcs, cnts = self.mentions_from_section(source=section_source,
                                                                                     target_id=page_id,
                                                                                     context_window_size=context_window_size)
                                if nvs:
                                    target_ids.extend(ts)
                                    name_variants.extend(nvs)
                                    left_contexts.extend(lcs)
                                    right_contexts.extend(rcs)
                                    page_counts.extend(cnts)
                    except:
                        continue

            pickle.dump((target_ids, name_variants, left_contexts, right_contexts, page_counts), open(filename, 'wb'))

        else:
            target_ids, name_variants, left_contexts, right_contexts, page_counts = \
                                                        pickle.load(open(filename, 'rb'))

        self.mentions = (target_ids, name_variants, left_contexts, right_contexts, page_counts)
Code example #8
    def get_articles(self, start):

        article = Wikipedia().article(start)
        links = article.links
        list_of_strings = []
        for l in links:
            l_art = Wikipedia().article(l)
            l_text =  re.sub(' +', ' ', plaintext(l_art.source), flags = re.DOTALL)
            l_text = re.sub('[.;:!?\'\"-\(\)1234567890@,]+', '', l_text, flags = re.DOTALL).lower()
            list_of_strings.append(l_text.split())

        return list_of_strings
Code example #9
def getFeaturedContent():
    wiki = Wikipedia()
    list = open("articalsTitle.txt", 'r')
    file = open("wikiData.txt", 'w')
    for i in range(2000):
        title = list.readline().replace("\n", "")
        article = wiki.search(title)
        if article is not None:
            for section in article.sections:
                if section.string != "":
                    file.write(section.string + "\n")
            time.sleep(0.2)
            print(title + " Get! " + str(i) + "/2000")
Code example #10
def celebrities():
    """ Returns a list of celebrities from Wikipedia.
    """
    celebrities = set()
    w = Wikipedia(language="en")
    p = w.search("List of people on the cover of Rolling Stone", cached=True)
    s = p.plaintext()
    for section in p.sections:
        if section.parent and section.parent.title in ("Individuals",):
            for name in section.links:
                if "* " + name in s:
                    celebrities.add(name)
    return celebrities
Code example #11
File: properties.py Project: cartisan/goldfinger
def celebrities():
    """ Returns a list of celebrities from Wikipedia.
    """
    celebrities = set()
    w = Wikipedia(language="en")
    p = w.search("List of people on the cover of Rolling Stone", cached=True)
    s = p.plaintext()
    for section in p.sections:
        if section.parent and section.parent.title in ("Individuals", ):
            for name in section.links:
                if "* " + name in s:
                    celebrities.add(name)
    return celebrities
Code example #12
File: celebrities.py Project: romanorac/botsvsquotes
def gender(name):
    """ Returns the gender of the given person (m/f).
    """
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        s = plaintext(p.string)
        s = s.lower()
        s = s.replace("\n", "\n ")
        m = sum(s.count(" %s " % x) for x in ("he", "his"))
        f = sum(s.count(" %s " % y) for y in ("she", "her"))
        g = m > f and "m" or "f"  # More "he" or more "she"?
        return g
    except:
        return None
Code example #13
def gender(name):
    """ Returns the gender of the given person (m/f).
    """
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        s = plaintext(p.string)
        s = s.lower()
        s = s.replace("\n", "\n ")
        m = sum(s.count(" %s " % x) for x in ( "he", "his"))
        f = sum(s.count(" %s " % y) for y in ("she", "her"))
        g = m > f and "m" or "f" # More "he" or more "she"?
        return g
    except:
        return None
Code example #14
File: newsgrapher.py Project: ageek/sentiment
def isnews(topic):
    engine = Wikipedia()
    result = engine.search(topic)
    if result:
        if topic.lower() not in result.title.lower():
            return False
        newsthings = ['places','cities','capitals','countries','people','wars']
        categories = result.categories
        for category in categories:
            for thing in newsthings:
                if thing in category.lower():
                    return True
        return False
    else:
        return False
Code example #15
File: hw2.py Project: sasha-pivovarov/Prog4
class WikiParser:
    def __init__(self):
        self.wiki = Wikipedia(language="en")
        self.punc = re.compile('[%s]' % re.escape(punctuation))

    def get_articles(self, start, depth, max_count):
        iterations = 0
        links = [start]
        list_of_strings = []
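        # Breadth-first crawl: fetch every link in the current frontier, collect the article texts, then descend one level.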
        while iterations <= depth and len(list_of_strings) <= max_count:
            links_temp = []
            for link in links:
                if iterations <= depth and len(list_of_strings) <= max_count:
                    try:
                        article = self.wiki.article(link)
                        text = self.process(article.plaintext())
                        new_links = article.links
                        list_of_strings.append(text)
                        links_temp.extend(new_links)
                        print(f"Processed link {link}")
                    except AttributeError:
                        print(f"Skipped link {link}")
                        continue
                else:
                    break
            links = links_temp
            iterations += 1

        return list_of_strings

    def process(self, text):
        tokens = text.split(" ")
        return " ".join([self.punc.sub("", x.lower().strip()) for x in tokens])
Code example #16
def isnews(topic):
    engine = Wikipedia()
    result = engine.search(topic)
    if result:
        if topic.lower() not in result.title.lower():
            return False
        newsthings = [
            'places', 'cities', 'capitals', 'countries', 'people', 'wars'
        ]
        categories = result.categories
        for category in categories:
            for thing in newsthings:
                if thing in category.lower():
                    return True
        return False
    else:
        return False
Code example #17
File: celebrities.py Project: romanorac/botsvsquotes
def celebrities():
    """ Returns a list of celebrities from Wikipedia.
    """
    celebrities = set()
    w = Wikipedia(language="en")
    p = w.search("List of people on the cover of Rolling Stone", cached=True)
    s = p.plaintext()
    # Extract the links from this page, excluding links in the footnotes section,
    # or links to band names (we just want names of individuals).
    for section in p.sections:
        if section.parent and section.parent.title in ("Individuals", ):
            for name in section.links:
                # Yes = * [Rob Zombie], musician and filmmaker
                # No  = * Mark White, bass guitarist for [Spin Doctors]
                if "* " + name in s:
                    celebrities.add(name)
    return celebrities
Code example #18
def celebrities():
    """ Returns a list of celebrities from Wikipedia.
    """
    celebrities = set()
    w = Wikipedia(language="en")
    p = w.search("List of people on the cover of Rolling Stone", cached=True)
    s = p.plaintext()
    # Extract the links from this page, excluding links in the footnotes section,
    # or links to band names (we just want names of individuals).
    for section in p.sections:
        if section.parent and section.parent.title in ("Individuals",):
            for name in section.links:
                # Yes = * [Rob Zombie], musician and filmmaker
                # No  = * Mark White, bass guitarist for [Spin Doctors]
                if "* " + name in s:
                    celebrities.add(name)
    return celebrities
Code example #19
def crawl_wiki():
    engine = Wikipedia(license=None, throttle=1.0, language='en')
    wikis = {}
    keywords = get_keywords()
    for keyword in keywords:
        stop = False
        while stop is False:
            try:
                article = engine.search(query=keyword)
            except Exception as e:
                print str(e)
                article = None

            if type(article) is pattern.web.WikipediaArticle:
                if article.disambiguation is False:
                    print '\nretrieving', keyword, '...',
                    wikis[keyword] = {}
                    wikis[keyword]['keyword'] = keyword
                    wikis[keyword]['text'] = article.plaintext()
                    stop = True
                else:
                    print '\n[', keyword, '] leads to disambiguation page!',
                    stop = True
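                    # Retry heuristics: rewrite the keyword (hyphens to spaces, short lowercase terms to uppercase) and search again.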

                    if '-' in keyword:
                        keyword = re.sub(
                            '-', ' ',
                            keyword)  # convert hyphen into white space
                        stop = False
                    if keyword.islower() and len(keyword) <= 5:
                        keyword = keyword.upper()
                        stop = False
            else:
                print '\n[', keyword, '] doesn\'t exist on wikipedia!',
                stop = True

                if '-' in keyword:
                    keyword = re.sub(
                        '-', ' ', keyword)  # convert hyphen into white space
                    stop = False
                if keyword.islower() and len(keyword) <= 5:
                    keyword = keyword.upper()
                    stop = False

    enpickle(wikis, 'data/txt/wiki.pkl')
    return wikis
Code example #20
def age(name):
    """ Returns the age of the given person.
    """
    # Use regular expression to try and parse 
    # a number of date formats from Wikipedia.
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        t = DOM(p.src)
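        # Prefer the infobox's .bday / .dday elements; fall back to regex matches on the plain text.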
        s = plaintext(p.string)
        s = re.sub(r"\[[0-9]+\]", "", s)
        r = r"\(born ([0-9]+ [A-Za-z]+ )?([0-9]{4})\)" # (born 1 December 2000)
        x = t(".bday")
        y = t(".dday")
        x = x[0].content if x else re.search(r, s).group(2)
        y = y[0].content if y else str(date().year)
        x = plaintext(x)
        y = plaintext(y)
        x = x.split("-")[0] # YYYY-MM-DD
        y = y.split("-")[0]
        a = int(y) - int(x)
        return a
    except:
        pass
    try:
        r = ur"[A-Za-z]+ [0-9]+, ([0-9]{4})"
        r = ur"\(%s – %s\)" % (r, r) # (May 15, 1912 – October 7, 2003)
        x = re.search(r, s).group(1)
        y = re.search(r, s).group(2)
        a = int(y) - int(x)
        return a
    except:
        pass
    try:
        r = r"\(aged ([0-9]+)\)" # (aged 78)
        a = t(".infobox td:contains('aged')")
        a = a[0].content
        a = plaintext(a)
        a = re.search(r, a).group(1)
        a = int(a)
        return a
    except:
        pass
    return None
Code example #21
def crawl_wiki(model_path):
    engine = Wikipedia(license=None, throttle=1.0, language='en')
    wikis = {}
    keywords = get_keywords(model_path=model_path, threshold=0.001)
    for keyword in keywords:
        stop = False
        while stop is False:
            try:
                article = engine.search(query=keyword)
            except Exception as e:
                print str(e)
                article = None

            if type(article) is pattern.web.WikipediaArticle:
                if article.disambiguation is False:
                    print '\nretrieving', keyword, '...',
                    wikis[keyword] = {}
                    wikis[keyword]['keyword'] = keyword
                    wikis[keyword]['text'] = article.plaintext()
                    stop = True
                else:
                    print '\n[', keyword, '] leads to disambiguation page!',
                    stop = True

                    if '-' in keyword:
                        keyword = re.sub('-', ' ', keyword)  # convert hyphen into white space
                        stop = False
                    if keyword.islower() and len(keyword) <= 5:
                        keyword = keyword.upper()
                        stop = False
            else:
                print '\n[', keyword, '] doesn\'t exist on wikipedia!',
                stop = True

                if '-' in keyword:
                    keyword = re.sub('-', ' ', keyword)  # convert hyphen into white space
                    stop = False
                if keyword.islower() and len(keyword) <= 5:
                    keyword = keyword.upper()
                    stop = False

    enpickle(wikis, 'data/others/wikis.pkl')
    print '\n'
    return wikis
Code example #22
def education(name, discrete=False, raw=False):
    """ Returns the education level of the given person (0.0-1.0).
    """
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        t = DOM(p.src)
      # e = the percentage of links to articles about academic titles / achievements.
        e = [t("a[href*='%s']" % x) for x in (
            "academi"    , "academy_of" , "bachelor_of" , "college"     , 
            "degree"     , "doctor"     , "emeritus"    , "engineer"    , 
            "faculty"    , "fellow"     , "genius"      , "grandmaster" , 
            "institut"   , "invent"     , "master_of"   , "mathemati"   , 
            "phd"        , "ph.d"       , "physics"     , "professor"   , 
            "school_of"  , "scien"      , "student"     , "universi"    , 
            "valedictor" , 
        )]
        e = sum(map(len, e))
        e = e / float(len(t("a")))
        if raw:
            return e
        # Convert e to a human-interpretable range (0.0-1.0),
        # based on observations in the list of p people below,
        # i.e., Pattie Maes should be > 0.9, Miley Cirus < 0.5.
        e = max(e, 0.0)
        e = min(e, 1.0)
        m = {
            0.000: 0.40,
            0.003: 0.50,
            0.010: 0.60,
            0.020: 0.70,
            0.030: 0.80,
            0.060: 0.90,
            1.000: 1.00
        }
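        # Piecewise-linear interpolation of e between the calibration points in m.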
        for x, y in zip(sorted(m), sorted(m)[1:]):
            if y > e:
                e = m[x] + (m[y] - m[x]) * (e - x) / (y - x)
                break
        # With discrete=True, returns "+" or "-".
        e = e if not discrete else ("-", "+")[e > 0.01]
        return e
    except:
        return None
Code example #23
File: celebrities.py Project: romanorac/botsvsquotes
def age(name):
    """ Returns the age of the given person.
    """
    # Use regular expression to try and parse
    # a number of date formats from Wikipedia.
    try:
        w = Wikipedia(language="en")
        p = w.search(name, cached=True)
        t = DOM(p.src)
        s = plaintext(p.string)
        s = re.sub(r"\[[0-9]+\]", "", s)
        r = r"\(born ([0-9]+ [A-Za-z]+ )?([0-9]{4})\)"  # (born 1 December 2000)
        x = t(".bday")
        y = t(".dday")
        x = x[0].content if x else re.search(r, s).group(2)
        y = y[0].content if y else str(date().year)
        x = plaintext(x)
        y = plaintext(y)
        x = x.split("-")[0]  # YYYY-MM-DD
        y = y.split("-")[0]
        a = int(y) - int(x)
        return a
    except:
        pass
    try:
        r = ur"[A-Za-z]+ [0-9]+, ([0-9]{4})"
        r = ur"\(%s – %s\)" % (r, r)  # (May 15, 1912 – October 7, 2003)
        x = re.search(r, s).group(1)
        y = re.search(r, s).group(2)
        a = int(y) - int(x)
        return a
    except:
        pass
    try:
        r = r"\(aged ([0-9]+)\)"  # (aged 78)
        a = t(".infobox td:contains('aged')")
        a = a[0].content
        a = plaintext(a)
        a = re.search(r, a).group(1)
        a = int(a)
        return a
    except:
        pass
    return None
Code example #24
 def get_articles(self, start):
     from pattern.web import Wikipedia, plaintext
     file = open(start+'.txt', 'w')
     file.write('')
     file.close()
     list_of_strings = []
     start_article  = Wikipedia().article(start)
     links = set(start_article.links)
     for link in start_article.links:
         cur_article = Wikipedia().article(link)
         if cur_article != None:
             newtext = plaintext(cur_article.source)
             newtext = self.normalize(newtext)
             list_of_strings.append(newtext)
             file = open(start+'.txt', 'a')
             file.write(newtext.encode('utf-8'))
             file.write('\n\n-----\n\n')
             file.close()
     return list_of_strings
Code example #25
    def get_articles(self, start):
        list_of_strings = []
        article = Wikipedia().article(start).plaintext()
        list_of_strings.append(self.plain_text(article))

        #for link in Wikipedia().article(start).links:
        #    if Wikipedia().article(link).language is 'en':
        #        list_of_strings.append(self.plain_text(link))

        return list_of_strings
Code example #26
 def show_results(self):
     parser = WikiParser()
     articles = parser.get_articles('Natural language processing')
     art_statistics = TextStatistics(articles)
     print('top-20 3-grams over the article corpus\r\n' + art_statistics.get_top_3grams(20))
     print('top-20 words over the article corpus\r\n' + art_statistics.get_top_words(20))
     
     article = Wikipedia().article('Natural language processing')
     a_statistics = TextStatistics(article)
     print('top-5 3-grams for the article itself \r\n' + a_statistics.get_top_3grams(5))
     print('top-5 words for the article itself \r\n' + a_statistics.get_top_words(5))
Code example #27
File: Wikipedia.py Project: valbarsau/leisuretime
def wiki(titulo):
    article = Wikipedia(language="en").search(query=titulo)
    result=[]
    if article:
        for section in article.sections:
            result.append(section.title+"\n"+section.string+"\n")
            for link in section.links:
                result.append(link+"\n")

    return result
        
    
Code example #28
File: search_wiki.py Project: byteface/sing
def run(o):

	#http://www.clips.ua.ac.be/pages/pattern-web#mail
	# should be able to do some cool stuff with the pattern libs	

	import os, sys;# sys.path.insert(0, os.path.join("..", ".."))

	from pattern.web import Wikipedia

	# This example retrieves an article from Wikipedia (http://en.wikipedia.org).
	# Wikipedia queries request the article HTML source from the server. This can be slow.
	# It is a good idea to cache results from Wikipedia locally,
	# and to set a high timeout when calling Wikipedia.search().

	engine = Wikipedia(language="en")

	# Contrary to the other search engines in the pattern.web module,
	# Wikipedia simply returns one WikipediaArticle object (or None),
	# instead of a list of results.
	article = engine.search("alice in wonderland", cached=True, timeout=30)

	print article.title               # Article title (may differ from the search query).
	print
	print article.languages["fr"]     # Article in French, can be retrieved with Wikipedia(language="fr").
	print article.links[:10], "..."   # List of linked Wikipedia articles.
	print article.external[:5], "..." # List of external URL's.
	print

	#print article.source # The full article content as HTML.
	#print article.string # The full article content, plain text with HTML tags stripped.

	# An article is made up of different sections with a title.
	# WikipediaArticle.sections is a list of WikipediaSection objects.
	# Each section has a title + content and can have a linked parent section or child sections.
	for s in article.sections:
	    print s.title.upper()
	    print 
	    print s.content # = ArticleSection.string, minus the title.
	    print
	    
Code example #29
    def get_articles(self, search_term):
        text_list = []

        article = Wikipedia().search(search_term)
        links = article.links

        s = self.__get_text(search_term)
        text_list.append(s)

        for link in links:
            text_list.append(self.__get_text(link))
            print(link)

        return text_list
Code example #30
	def search(self, query, language='es'):
		'''
			query: string
			language: 'en' or 'es'
		'''
		wikipedia = Wikipedia(language=language)
		google_result_list = self.google.simple_search(query + ' ' + 'wikipedia')
		wikipedia_results = []
		for result in google_result_list:
			try:
				if self.url_pattern in result['link']:
					article = {}
					title = result['title'].split(' - ')[0]
					print title
					art = wikipedia.search(title)
					print art
					article['title'] = art.title
					article['text'] = art.string
					article['related'] = art.links
					wikipedia_results.append(article)
			except:
				pass
		return wikipedia_results
Code example #31
File: lsa.py Project: suraj-deshmukh/pyLSA
 def search_wiki(self, k):  # scrape query's wikipedia article
     article = Wikipedia().search(k)
     contents = [
         section.content.encode("utf8") for section in article.sections
     ]
     d = []
     for content in contents:
         a = content.split()
         d.append(a)
     content = [
         j for i in d for j in i
         if re.match("^[a-zA-Z_-]*$", j) and len(j) > 1
     ]  # take only meaningful content
     self.content = ' '.join(content)
     return self.content
Code example #32
File: mine.py Project: nirabhratapaswi/nlp
def get_info(search_query):
	if isinstance(search_query, str):
		search_query = str(search_query)
	else:
		return { "Error": "Pass a string, from mine.py [7]", "Result": [None] }

	result = []
	engineGoogle = Google(license=None, throttle=0.5, language=None)
	engineBing = Bing(license=None, throttle=0.5, language=None)
	engineTwitter = Twitter(license=None, throttle=0.5, language=None)
	engineFacebook = Facebook(license=None, throttle=1.0, language='en')
	engineWikipedia = Wikipedia(license=None, throttle=5.0, language=None)
	engineFlickr = Flickr(license=None, throttle=5.0, language=None)
	engineArray = [engineGoogle, engineBing, engineTwitter, engineFacebook, engineWikipedia, engineFlickr]
	engineArray = [engineGoogle, engineTwitter]

	'''
	for i in range(1,2):
		# result = result + ([repr(plaintext(para.text)) for para in engine[0].search(search_query, type=SEARCH, start=i, count=5)])
		[result.append([result.append(repr(plaintext(para.text))) for para in engine.search(search_query, type=SEARCH, start=i, count=5)]) for engine in engineArray]
			# print repr(plaintext(para.text))
			# print repr(plaintext(para.url)) + '\n\n'
			# result.append(repr(plaintext(para.text)))
	'''

	# Google
	for i in range(1, 5):
		result = result + ([para.text for para in engineGoogle.search(search_query, type=SEARCH, start=i, count=10)])
		
	for i in range(1, 5):
		result = result + ([para.text for para in engineTwitter.search(search_query, type=SEARCH, start=i, count=10)])
	'''
	# for i in range(1,2):
		# result = result + ([repr(plaintext(para.text)) for para in engineBing.search(search_query, type=SEARCH, start=i, count=5)])
	for i in range(1,2):
		result = result + ([repr(plaintext(para.text)) for para in engineTwitter.search(search_query, type=SEARCH, start=i, count=10)])
	# for i in range(1,2):
		# result = result + ([repr(plaintext(para.text)) for para in engineFacebook.search(search_query, type=SEARCH, start=i, count=5)])
	# for i in range(1,2):
		# result = result + ([repr(plaintext(para.text)) for para in engineWikipedia.search(search_query, type=SEARCH, start=i, count=5)])
	# for i in range(1,2):
		# result = result + ([repr(plaintext(para.text)) for para in engineFlickr.search(search_query, type=SEARCH, start=i, count=5)])
	'''

	return { "Error": None, "Result": result }

	# return { "Error": None, "Result": ['Hello World', 'Bye Bye Tommy'] }
Code example #33
    def __get_text(self, search_term):

        article = Wikipedia().search(search_term)
        if article:
            s = article.string
            if s:
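                # Collapse newlines and runs of spaces into single spaces.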
                while '\n' in s:
                    s = s.replace('\n', ' ')
                while '  ' in s:
                    s = s.replace('  ', ' ')

                s = re.sub(r'[^\w\s]', '', s)
                s = s.lower()
                return s

            else:
                return None

        else:
            return None
Code example #34
    def __init__(self):
        from pattern.web import Wikipedia, plaintext
        import string
        import re
        self.parser = WikiParser()
        self.articles = self.parser.get_articles('Natural_language_processing')
        self.stats = TextStatistics(self.articles)
        self.grams = self.stats.get_top_3grams()
        self.words = self.stats.get_top_words()
        self.top_grams_out = [' - '.join([self.grams[0][i], str(self.grams[1][i])]) for i in range(20)]
        self.top_grams = self.grams[0][:20]
        self.top_words_out = [' - '.join([self.words[0][i], str(self.words[1][i])]) for i in range(20)]
        self.top_words = self.words[0][:20]

        nlp = Wikipedia().article('Natural_language_processing')
        nlptext = plaintext(nlp.source)
        parse = WikiParser()
        nlptext = parse.normalize(nlptext)
        self.nlp_stats = TextStatistics([nlptext])
        self.nlp_top_grams = [' - '.join([self.nlp_stats.get_top_3grams()[0][i], str(self.nlp_stats.get_top_3grams()[1][i])]) for i in range(20)][:5]
        self.nlp_top_words = [' - '.join([self.nlp_stats.get_top_words()[0][i], str(self.nlp_stats.get_top_words()[1][i])]) for i in range(20)][:5]
Code example #35
    def get_wiki_article(self,
                         search,
                         separate_in_section=False,
                         type_of_text=u'Plain text'):
        segments = list()
        article = Wikipedia(language=self.dico_lang[self.language]).search(
            search, cached=False)
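        # Wrap the whole article, or each of its sections, in an Input segment and attach source annotations.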
        if article:
            if separate_in_section:
                for section in article.sections:
                    if type_of_text == u'Plain text':
                        wiki_article = Input(section.string)
                    else:
                        wiki_article = Input(section.html)

                    annotations = {
                        'source': 'Wikipedia',
                        'section title': section.title,
                        'section level': section.level,
                        'search': search,
                    }
                    segment = wiki_article[0]
                    segment.annotations.update(annotations)
                    wiki_article[0] = segment
                    segments.append(wiki_article)
            else:
                if type_of_text == u'Plain text':
                    wiki_article = Input(article.string)
                else:
                    wiki_article = Input(article.html)
                annotations = {
                    'source': 'Wikipedia',
                    'search': search,
                }
                segment = wiki_article[0]
                segment.annotations.update(annotations)
                wiki_article[0] = segment
                segments.append(wiki_article)
        return segments
Code example #36
File: Wiki_Avg_Length.py Project: byronwasti/SoftDes
def Avg(s):
    ''' The Averaging Function
        Takes in a string, searches for the Wiki article
        and then returns the average length of words.
        No there is no error handling.
    '''
    length = 0
    numb = 0

    #
    if s in db:
        return float(db[s])
    article = Wikipedia().search(s)
    for i in xrange(len(article.sections)):
        if article.sections[i].title == "See also":
            break
        text = article.sections[i].content.replace('\n', ' ')
        for word in text.split(' '):
            numb += 1
            length += len(word)
    db[s] = str(float(length) / float(numb))
    return float(length) / float(numb)
Code example #37
            self.writerow(row)
			
output = open("Lua_scraping_output.csv", "wb")
writer = UnicodeWriter(output)		

writer.writerow(["Article Title", "Date", "IP", "Bytes of Edit", "IP Location", "latitude", "longitude"])
	


url_string = 'http://en.wikipedia.org/w/index.php?title=' + 'Lua_(programming_language)&offset=&limit=500' + '&action=history'

url = URL(url_string)

dom = DOM(url.download(cached=True))

engine = Wikipedia(license=None)

article = engine.search('Lua Programming Language')

a = 0

while (len(dom.by_class("mw-nextlink")) > 0):
  page_history_links = dom.by_tag("ul")[0].by_tag("li")
  for link in page_history_links:
    date = HTMLParser.HTMLParser().unescape(link.by_class("mw-changeslist-date")[0].content).encode('ascii','ignore') 
    ip = HTMLParser.HTMLParser().unescape(link.by_class("history-user")[0].by_tag("a")[0].content).encode('ascii','ignore')  
    bytes = HTMLParser.HTMLParser().unescape(link.by_class("history-size")[0].content).encode('ascii','ignore')  
    ip_url = 'http://api.hostip.info/get_json.php?ip=' + ip + '&position=true'
    req = urllib2.urlopen(ip_url)
    req_request = urllib2.Request(ip_url)
    #handler = urllib2.urlopen(req)
Code example #38
import os, sys; sys.path.insert(0, os.path.join("..", ".."))

from pattern.web import Wikipedia

# This example retrieves an article from Wikipedia (http://en.wikipedia.org).
# A query requests the article's HTML source from the server, which can be quite slow.
# It is a good idea to cache results from Wikipedia locally,
# and to set a high timeout when calling Wikipedia.search().

engine = Wikipedia(language="en")

# Contrary to other search engines in the module,
# Wikipedia simply returns one WikipediaArticle object (or None) instead of a list of results.
article = engine.search("alice in wonderland", cached=True, timeout=30)

print article.title               # Article title (may differ from the search query).
print
print article.languages["fr"]     # Article in French, can be retrieved with Wikipedia(language="fr").
print article.links[:10], "..."   # List of linked Wikipedia articles.
print article.external[:5], "..." # List of external URL's.
print

#print article.source # The full article content as HTML.
#print article.string # The full article content, plain text with HTML tags stripped.

# An article is made up of different sections with a title.
# WikipediaArticle.sections is a list of WikipediaSection objects.
# Each section has a title + content and can have a linked parent section or child sections.
for s in article.sections:
    print s.title.upper()
    print 
Code example #39
File: main.py Project: meadhikari/wiki-movie-plot
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from pattern.web import Wikipedia
import webapp2
import logging
import jinja2
from convert2html import plaintext2html
from urllib2 import quote
from backend import plot,title_from_imdb,more_movie_info
jinja_env = jinja2.Environment(loader = jinja2.FileSystemLoader("template"), autoescape = True)
import os, sys; sys.path.insert(0, os.path.join("..", ".."))
engine = Wikipedia(language="en")
class Handler(webapp2.RequestHandler):
  def write(self,*a,**kw):
    self.response.out.write(*a,**kw)
  def render_str(self,template,**params):
    t = jinja_env.get_template(template)
    return t.render(params)
  def render(self,template,**kw):
    self.write(self.render_str(template, **kw))

class MainHandler(Handler):
    def get(self):
       self.render("index.html")

    def post(self):
		movie_name = self.request.get('movie_name')
Code example #40
            self.writerow(row)
			
output = open("better_ruby_scraping_output.csv", "wb")
writer = UnicodeWriter(output)		

writer.writerow(["Article Title", "Date", "IP", "Bytes of Edit", "IP Location", "latitude", "longitude"])
	


url_string = 'http://en.wikipedia.org/w/index.php?title=' + 'Ruby_(programming_language)&offset=&limit=500' + '&action=history'

url = URL(url_string)

dom = DOM(url.download(cached=True))

engine = Wikipedia(license=None)

article = engine.search('ruby programming language')

a = 0

while (len(dom.by_class("mw-nextlink")) > 0):
  page_history_links = dom.by_tag("ul")[0].by_tag("li")
  for link in page_history_links:
    date = HTMLParser.HTMLParser().unescape(link.by_class("mw-changeslist-date")[0].content).encode('ascii','ignore') 
    ip = HTMLParser.HTMLParser().unescape(link.by_class("history-user")[0].by_tag("a")[0].content).encode('ascii','ignore')  
    bytes = HTMLParser.HTMLParser().unescape(link.by_class("history-size")[0].content).encode('ascii','ignore')  
    ip_url = 'http://api.hostip.info/get_json.php?ip=' + ip + '&position=true'
    req = urllib2.urlopen(ip_url)
    req_request = urllib2.Request(ip_url)
    #handler = urllib2.urlopen(req)
Code example #41
"""
Retrieves an article from Wikipedia (http://en.wikipedia.org).
"""
from pattern.web import Wikipedia

ENGINE = Wikipedia(language="en")


def main(search_query):
    """ Returns Twitter Search Results
    :param search_query: (str)
    """
    final = "Wikipedia Search Results:"
    article = ENGINE.search(search_query, cached=True, timeout=30)

    print article.title  # Article title (may differ from the search query).
    print ""
    print article.languages[
        "tr"]  # Article in Turkish, can be retrieved with Wikipedia(language="tr").
    print article.links[:10]  # List of linked Wikipedia articles.
    print article.external[:5]  # List of external URL's.
    print ""

    for s in article.sections:
        final = final + "\n\n" + s.title.upper() + "\n" + s.content

    return final


if __name__ == '__main__':
    main("")
Code example #42
File: Wikipedia.py Project: valbarsau/leisuretime
def descarga(titulo):
    engine = Wikipedia(language="en")
    result= engine.search(titulo, type=SEARCH)
    return repr(plaintext(result.string))
Code example #43
File: test.py Project: rboling/data_analysis_work
            self.writerow(row)
			
output = open("new_scraping_output.csv", "wb")
writer = UnicodeWriter(output)		

writer.writerow(["Article Title", "Date", "IP", "Bytes of Edit", "IP Location", "latitude", "longitude"])
	


url_string = 'http://en.wikipedia.org/w/index.php?title=' + 'Python_(programming_language)&offset=&limit=500' + '&action=history'

url = URL(url_string)

dom = DOM(url.download(cached=True))

engine = Wikipedia(license=None)

article = engine.search('python programming language')

a = 0

while (len(dom.by_class("mw-nextlink")) > 0):
  page_history_links = dom.by_tag("ul")[0].by_tag("li")
  for link in page_history_links:
    date = HTMLParser.HTMLParser().unescape(link.by_class("mw-changeslist-date")[0].content).encode('ascii','ignore') 
    ip = HTMLParser.HTMLParser().unescape(link.by_class("history-user")[0].by_tag("a")[0].content).encode('ascii','ignore')  
    bytes = HTMLParser.HTMLParser().unescape(link.by_class("history-size")[0].content).encode('ascii','ignore')  
    ip_url = 'http://api.hostip.info/get_json.php?ip=' + ip + '&position=true'
    req = urllib2.urlopen(ip_url)
    req_request = urllib2.Request(ip_url)
    #handler = urllib2.urlopen(req)
Code example #44
            self.writerow(row)
			
output = open("ocaml_scraping_output.csv", "wb")
writer = UnicodeWriter(output)		

writer.writerow(["Article Title", "Date", "IP", "Bytes of Edit", "IP Location", "latitude", "longitude"])
	


url_string = 'http://en.wikipedia.org/w/index.php?title=' + 'OCaml&offset=&limit=500' + '&action=history'

url = URL(url_string)

dom = DOM(url.download(cached=True))

engine = Wikipedia(license=None)

article = engine.search('Ocaml')

a = 0

while (len(dom.by_class("mw-nextlink")) > 0):
  page_history_links = dom.by_tag("ul")[0].by_tag("li")
  for link in page_history_links:
    date = HTMLParser.HTMLParser().unescape(link.by_class("mw-changeslist-date")[0].content).encode('ascii','ignore') 
    ip = HTMLParser.HTMLParser().unescape(link.by_class("history-user")[0].by_tag("a")[0].content).encode('ascii','ignore')  
    bytes = HTMLParser.HTMLParser().unescape(link.by_class("history-size")[0].content).encode('ascii','ignore')  
    ip_url = 'http://api.hostip.info/get_json.php?ip=' + ip + '&position=true'
    req = urllib2.urlopen(ip_url)
    req_request = urllib2.Request(ip_url)
    #handler = urllib2.urlopen(req)
Code example #45
# Word frequency.

from pattern.web import Wikipedia
from pattern.vector import words

frequency = {}

# Spreading activation.
# Parse links from seed article & visit those articles.

links, seen = set(["Italia"]), {}

while len(links) > 0:

    try:
        article = Wikipedia(language="it").search(links.pop(), throttle=10)
        seen[article.title] = True

        # Parse links from article.

        for link in article.links:

            if link not in seen:
                links.add(link)

        # Parse words from article. Count words.

        for word in words(article.string):

            if word not in frequency:
                frequency[word] = 0
Code example #46
File: 07-wikipedia.py Project: vijaynandwani/pattern
from __future__ import print_function
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))

from pattern.web import Wikipedia

# This example retrieves an article from Wikipedia (http://en.wikipedia.org).
# Wikipedia queries request the article HTML source from the server. This can be slow.
# It is a good idea to cache results from Wikipedia locally,
# and to set a high timeout when calling Wikipedia.search().

engine = Wikipedia(language="en")

# Contrary to the other search engines in the pattern.web module,
# Wikipedia simply returns one WikipediaArticle object (or None),
# instead of a list of results.
article = engine.search("alice in wonderland", cached=True, timeout=30)

# Article title (may differ from the search query).
print(article.title)
print()
# Article in French, can be retrieved with Wikipedia(language="fr").
print(article.languages["fr"])
print(article.links[:10], "...")  # List of linked Wikipedia articles.
print(article.external[:5], "...")  # List of external URL's.
print()

# print article.source # The full article content as HTML.
# print article.string # The full article content, plain text with HTML
# tags stripped.
Code example #47
def main():
    article = Wikipedia(language="en").search('small', throttle=10)
    print article.string
Code example #48
def isa(entity, type=PERSON):
    """ Returns True if the given entity is of the given type.
    """
    # - Wikipedia.search() returns a WikipediaArticle:
    #   http://www.clips.ua.ac.be/pages/pattern-web#wikipedia
    # - The article comes with the HTML source code.
    # - The article comes with a plaintext version (no HTML tags).
    # - We can count how many times a word occurs in the plain text
    #   (e.g., articles about cities don't often use "he" or "she").
    # - We can search the HTML parse tree with CSS selectors.
    #   (e.g., the HTML often has a <div class="infobox"> with interesting metadata).
    try:
        w = Wikipedia(language="en")
        p = w.search(entity, cached=True)
        t = DOM(p.src) # HTML parse tree
        s = p.plaintext()
        s = s.lower()
        s = s.replace(".", " ")
        s = s.replace(",", " ")
        s = s.replace("'", " ")
        s = s.replace('"', " ")
        n = s.count(" ") * 1.0 or 0.0 # approximate word count
    except:
        pass
    # A person is an entity with a biography, a birthdate, and
    # a life's description containing gender-specific pronouns
    # (e.g., Noam Chomsky, Arnold Schwarzenegger).
    if type == PERSON:
        if t(".infobox.biography"):
            return True
        if t(".infobox th:contains('born')"):
            return True
        if any("early life" in x.title.lower() for x in p.sections):
            return True
        if sum(s.count(" %s " % w) for w in ( "he", "his")) / n > 0.01: # 1% he
            return True
        if sum(s.count(" %s " % w) for w in ("she", "her")) / n > 0.01: # 1% she
            return True
    # A place is an entity with a geography and/or a population
    # (e.g., New York, Jupiter, Middle Earth).
    if type == PLACE:
        if t(".infobox.geography"):
            return True
        if t(".infobox th:contains('coordinates')"):
            return True
        if t(".infobox th:contains('location')"):
            return True
        if t(".infobox th:contains('population')"):
            return True
        if t(".infobox th:contains('orbital period')"):
            return True
        if t("h2:contains('climate')"):
            return True
        if t("h2:contains('architecture')"):
            return True
        if any("geography" in x.title.lower() for x in p.sections):
            return True
        if any("flora" in x.title.lower() for x in p.sections):
            return True
        if any("fauna" in x.title.lower() for x in p.sections):
            return True
        if sum(s.count(" %s " % w) for w in (
          "city",
          "country",
          "house",
          "land",
          "location",
          "place",
          "room",
          "rooms",
          "space",
          "setting", 
          "town")) / n > 0.01:
            return True
    return False
Code example #49
File: wiki_data2-7.py Project: Krolov18/Languages
def recherche_wikipedia(recherche, language):
	"""
		Fonction principale de ce script python 27 qui fait une requête
		wikipedia afin de récupérer les informations sur une page
		wikipedia.
		
		La fonction n'a formatté que les informations des recherches
		"fr" et "en" et plus précisément les sections pour "fr"
		Fiche technique, distribution, voix françaises et originales
		et en "en" le casting. Cette fonction peut évidemment être
		étoffée pour récupérer plus d'informations.
	"""
	datas = yaml.load(open("/home/krolev/Documents/Projet_media/BIBLIOTHEQUES/formats_acceptes.yaml"))
	engine = Wikipedia(language=language)
	searching = engine.search(recherche)
	Sections = searching.sections
	metadata = {}
	def fonction(part=True,sepa = ":"):
		"""
			fonction interne à recherche wikipedia qui permet de mettre
			en forme le texte récupéré pour être formatté avant le
			passage dans YAML.load()
		"""
		temp = [x.strip() for x in section.content.replace('* ','').split('\n') if x != u""]
		liste = []
		for element in temp:
			element = element.encode('utf-8')
			if part:
				(cle,sep,attr) = element.partition(sepa)
			else:
				(cle,sep,attr) = element.rpartition(sepa)
			attr = attr.strip()
			cle = cle.strip()
			if "'" in cle:
				attr = attr.replace("'","''")
			if "'" in attr:
				attr = attr.replace("'","''")
			if ":" in cle:
				cle = cle.replace(':','--')
			if ":" in attr:
				attr = attr.replace(":","--")
			element = " ".join([x for x in [cle+sep, attr] if x != '""'])
			if element_inString(element,datas["countries"]):
				element = " "+element
			elif (not ":" in element):
				element = " "+element
			liste.append(element)
		return liste
	
	if language == "fr":
		for section in Sections:
			if section.title == u"Fiche technique":
				metadata.update({"Fiche_technique":yaml.load("\n".join(fonction()[1:-1]))})
			elif section.title == u"Distribution":
				temp = fonction()
				if len(temp) != 1:
					metadata.update({"Distribution":yaml.load("\n".join(fonction(part=False)[1:-1]))})
			elif section.title == u"Voix françaises":
				metadata.update({u"Voix françaises":yaml.load('\n'.join(fonction()[1:-1]))})
			elif section.title == u"Voix originales":
				metadata.update({"Voix originales":yaml.load('\n'.join(fonction()[1:-1]))})
	if language == "en":
		for section in Sections:
			if section.title == 'Cast':
				liste = []
				for element in fonction(sepa="as")[1:-1]:
					(cle, sep, val) = element.partition('as')
					element = cle+":"+val
					liste.append(element)
				metadata.update({"Casting":yaml.load('\n'.join(liste))})
	#return metadata
	return yaml.dump(metadata, default_flow_style = False, allow_unicode = True)