Example #1
    def get(self):
        wikipedia.set_lang(u'ru')
        try:
            page = wikipedia.page(u'Проект:Города_России/Списки_улиц/Казани')
            streets = []
            for link in page.links:
                nlink = unicode(link).encode('utf-8').strip().decode('utf-8')
                norm_name = normalize(nlink)
                try:
                    street_info = StreetInfo.get_by_norm_name(norm_name)
                    if not street_info:
                        street_info = StreetInfo()

                    street_page = wikipedia.page(nlink)
                    street_info.name = nlink
                    street_info.norm_name = norm_name
                    street_info.info = unicode(street_page.summary).encode('utf-8').strip()
                    street_info.images = [Image(url=x) for x in street_page.images]
                    street_info.city = u'Казань'.encode('utf-8').strip()
                    street_info.lang=u'ru'.encode('utf-8').strip()
                    
                    street_info.put()
                    
                except Exception, e:
                    print nlink.encode('utf-8')
        except DeadlineExceededError:
            pass
        
        self.response.headers['Content-Type'] = "text/html; charset=utf-8"
        self.response.write(json.dumps({'success':True}))
Example #2
  def test_redirect_normalization(self):
    """Test that a page redirect loads correctly with or without a query normalization"""
    capital_party = wikipedia.page("Communist Party", auto_suggest=False)
    lower_party = wikipedia.page("communist Party", auto_suggest=False)

    self.assertIsInstance(capital_party, wikipedia.WikipediaPage)
    self.assertIsInstance(lower_party, wikipedia.WikipediaPage)
    self.assertEqual(capital_party.title, "Communist party")
    self.assertEqual(capital_party, lower_party)
Example #3
 def get(self):
     wikipedia.set_lang(u"ru")
     page = wikipedia.page(u"Проект:Города_России/Списки_улиц/Казани")
     streets = []
     for link in page.links:
         nlink = unicode(link).encode("utf-8").strip()
         try:
             street_page = wikipedia.page(nlink)
             streets.append(
                 {"name": nlink, "info": street_page.summary, "images": street_page.images, "city": u"Казань"}
             )
         except Exception, e:
             print nlink
Example #4
def wiki(bot, event, *args):
    """
    *Wikipedia:*
    Usage: /wiki <keywords to search for> <optional: sentences to display [defaults to 3]>
    Purpose: Get summary from Wikipedia on keywords.
    """
    from wikipedia import wikipedia, PageError, DisambiguationError
    yield from bot.send_typing(event.conv)

    def summary(self, sentences=3):
        if not getattr(self, '_summary', False):
            query_params = {
                'prop': 'extracts',
                'explaintext': '',
                'exintro': '',
            }
            query_params['exsentences'] = sentences
            if getattr(self, 'title', None) is not None:
                query_params['titles'] = self.title
            else:
                query_params['pageids'] = self.pageid

            request = wikipedia._wiki_request(query_params)
            self._summary = request['query']['pages'][self.pageid]['extract']

        return self._summary

    wikipedia.WikipediaPage.summary = summary
    try:
        sentences = 3
        try:
            if args[-1].isdigit():
                sentences = args[-1]
                args = args[:-1]
            page = wikipedia.page(' '.join(args))
        except DisambiguationError as e:
            page = wikipedia.page(wikipedia.search(e.options[0], results=1)[0])
        segments = [
            hangups.ChatMessageSegment(page.title,
                                       hangups.SegmentType.LINK,
                                       is_bold=True,
                                       link_target=page.url),
            hangups.ChatMessageSegment('\n', hangups.SegmentType.LINE_BREAK),
            hangups.ChatMessageSegment(page.summary(sentences=sentences))
        ]

        yield from bot.send_message_segments(event.conv, segments)
    except PageError:
        yield from bot.send_message(
            event.conv,
            "Couldn't find \"{}\". Try something else.".format(' '.join(args)))
Example #5
    def test_disambiguate(self):
        """Test that page raises an error when a disambiguation page is reached."""
        try:
            ram = wikipedia.page("Dodge Ram (disambiguation)", auto_suggest=False, redirect=False)
            error_raised = False
        except wikipedia.DisambiguationError as e:
            error_raised = True
            options = e.options

        self.assertTrue(error_raised)
        self.assertEqual(
            options,
            [
                u"Dodge Ramcharger",
                u"Dodge Ram Van",
                u"Dodge Mini Ram",
                u"Dodge Caravan C/V",
                u"Dodge Caravan C/V",
                u"Ram C/V",
                u"Dodge Ram 50",
                u"Dodge D-Series",
                u"Dodge Rampage",
                u"Ram (brand)",
            ],
        )
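Outside of a test, the options carried by DisambiguationError can drive a retry instead of a failure. A minimal sketch, assuming the first option is an acceptable fallback (the /wiki command in Example #4 uses the same idea):

import wikipedia

def page_or_first_option(title):
    # Fall back to the first disambiguation option when the title is ambiguous;
    # the fallback itself may raise again if that option is also ambiguous.
    try:
        return wikipedia.page(title, auto_suggest=False)
    except wikipedia.DisambiguationError as e:
        return wikipedia.page(e.options[0], auto_suggest=False)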
Example #6
def main():
    _tmpb = get_biographies(CATEGORY, STARTTIME, ENDTIME)

    print(str(len(_tmpb)))

    if len(_tmpb) == 0:
        sys.exit()

    table_data = {}

    tableservice = table_service()
    requests.packages.urllib3.disable_warnings()

    for r in _tmpb:
        try:
            print(r)
            p = wikipedia.page(title=None, pageid=str(r['pageid']))
            _revs = [
                r for r in get_revisions(str(p.pageid)) if "delet" in str(r)
            ]
            #table_data.update({str(r['pageid']):{'PAGEID': str(p.pageid),'TOUCHED': str(p.touched),'URL': str(p.url),'TITLE': str(p.title)}})
            _task = create_task(str(DATASET_MARKER), str(r['timestamp']),
                                str(p.pageid),
                                str(random.randint(100000, 99999999)),
                                str(p.pageid), str(p.title), _revs, str(p.url))
            print(_task)
            tableservice.insert_entity(AZURE_TABLE, _task)
        except Exception as e:
            print("Error: %s" % e)
            continue
Example #7
  def test_auto_suggest(self):
    """Test that auto_suggest properly corrects a typo."""
    # yum, butter.
    butterfly = wikipedia.page("butteryfly")

    self.assertEqual(butterfly.title, "Butterfly")
    self.assertEqual(butterfly.url, "http://en.wikipedia.org/wiki/Butterfly")
Example #8
 def test_outlinks(self):
     links = self.dao.fetch_outlinks("Paris", 15)
     all_links = (wiki.page("Paris")).links
     self.assertEqual(len(links), len(set(links)))
     for link in links:
         self.assertIn(link, all_links)
     self.assertEqual(len(links), 15)
Example #9
  def test_redirect_true(self):
    """Test that a page successfully redirects a query."""
    # no error should be raised if redirect is True
    mp = wikipedia.page("Menlo Park, New Jersey")

    self.assertEqual(mp.title, "Edison, New Jersey")
    self.assertEqual(mp.url, "http://en.wikipedia.org/wiki/Edison,_New_Jersey")
Example #10
  def test_redirect_true(self):
    """Test that a page successfully redirects a query."""
    # no error should be raised if redirect is True
    mp = wikipedia.page("Menlo Park, New Jersey")

    self.assertEqual(mp.title, "Edison, New Jersey")
    self.assertEqual(mp.url, "http://en.wikipedia.org/wiki/Edison,_New_Jersey")
Example #11
  def test_auto_suggest(self):
    """Test that auto_suggest properly corrects a typo."""
    # yum, butter.
    butterfly = wikipedia.page("butteryfly")

    self.assertEqual(butterfly.title, "Butterfly")
    self.assertEqual(butterfly.url, "http://en.wikipedia.org/wiki/Butterfly")
Example #12
    def test_disambiguate(self):
        """Test that page raises an error when a disambiguation page is reached."""
        try:
            ram = wikipedia.page("Dodge Ram (disambiguation)",
                                 auto_suggest=False,
                                 redirect=False)
            error_raised = False
        except wikipedia.DisambiguationError as e:
            error_raised = True
            options = e.options

        self.assertTrue(error_raised)
        self.assertEqual(
            options,
            [
                u"Dodge Ramcharger",
                u"Dodge Ram Van",
                u"Dodge Mini Ram",
                u"Dodge Caravan C/V",
                u"Dodge Caravan C/V",
                u"Ram C/V",
                u"Dodge Ram 50",
                u"Dodge D-Series",
                u"Dodge Rampage",
                u"Ram (brand)",
            ],
        )
Example #13
    def test_find_icd_section_title(self):
        wikipedia_client: WikipediaClient = WikipediaClient("en")
        parsed_response_content: dict = wikipedia_client.search_title("ICD-10")
        icd_list_page_title: str = parsed_response_content["query"]["search"][0]["title"]
        icd_list_page_html: str = str(wikipedia.page(icd_list_page_title).html())
        disease_group_page_title: str = HtmlParser.find_icd_section_title(icd_list_page_html, "E10.3")

        self.assertEqual("ICD-10 Chapter IV: Endocrine, nutritional and metabolic diseases", disease_group_page_title)
Example #14
def title(query: str) -> str:
    wikipedia.set_lang('en')
    search = wikipedia.search(query)[0]
    return json.dumps(
        filterResult(
            wikipedia.page(
                search['title']
            )))
Example #15
def tell_me_about(topic):
    try:
        ny = wikipedia.page(topic)
        res = str(ny.content[:500].encode('utf-8'))
        return res
    except Exception as e:
        print(e)
        return "Sorry sir"
Example #16
def import_images():
    image_collection.remove(source='wiki')

    wikipedia.set_lang('ru')
    root_page = wikipedia.page('Экспонаты эрмитажа')

    for link in root_page.links:
        import_images_from_page(link)
Example #17
def pageid(query: str) -> str:
    print(query)
    wikipedia.set_lang('en')
    search = wikipedia.search(query)[0]
    return json.dumps(
        filterResult(
            wikipedia.page(
                None, search['pageid']
            )))
Example #18
def wiki(bot, event, *args):
    """
    **Wikipedia:**
    Usage: /wiki <keywords to search for> <optional: sentences to display [defaults to 3]>
    Purpose: Get summary from Wikipedia on keywords.
    """
    from wikipedia import wikipedia, PageError, DisambiguationError

    def summary(self, sentences=3):
        if not getattr(self, '_summary', False):
            query_params = {
                'prop': 'extracts',
                'explaintext': '',
                'exintro': '',
            }
            query_params['exsentences'] = sentences
            if getattr(self, 'title', None) is not None:
                query_params['titles'] = self.title
            else:
                query_params['pageids'] = self.pageid

            request = wikipedia._wiki_request(query_params)
            self._summary = request['query']['pages'][self.pageid]['extract']

        return self._summary

    wikipedia.WikipediaPage.summary = summary
    try:
        sentences = 3
        try:
            if args[-1].isdigit():
                sentences = args[-1]
                args = args[:-1]
            page = wikipedia.page(' '.join(args))
        except DisambiguationError as e:
            page = wikipedia.page(wikipedia.search(e.options[0], results=1)[0])
        segments = [
            hangups.ChatMessageSegment(page.title, hangups.SegmentType.LINK, is_bold=True, link_target=page.url),
            hangups.ChatMessageSegment('\n', hangups.SegmentType.LINE_BREAK),
            hangups.ChatMessageSegment(page.summary(sentences=sentences))]

        bot.send_message_segments(event.conv, segments)
    except PageError:
        bot.send_message(event.conv, "Couldn't find \"{}\". Try something else.".format(' '.join(args)))
Example #19
    def test_find_disease_name_and_link(self):
        wikipedia_client: WikipediaClient = WikipediaClient("en")
        chapter_title: str = "ICD-10 Chapter IV: Endocrine, nutritional and metabolic diseases"
        parsed_response_content: dict = wikipedia_client.search_title(chapter_title)
        icd_disease_group_page_title: str = parsed_response_content["query"]["search"][0]["title"]
        icd_disease_group_page_html: str = str(wikipedia.page(icd_disease_group_page_title).html())
        link, title = HtmlParser.find_disease_name_and_link(icd_disease_group_page_html, "E10.3")

        self.assertEqual("/wiki/Diabetic_retinopathy", link)
        self.assertEqual("Diabetic retinopathy", title)
Example #20
    def _get_icd_chapter_article_page(self, title: str) -> str:
        if title in self.wikipedia_pages_cache:
            return self.wikipedia_pages_cache[title]

        result: str = str(wikipedia.page(title).html())
        self.wikipedia_pages_cache[title] = result
        if len(self.wikipedia_pages_cache) > 4:
            # Removes oldest page in cache
            self.wikipedia_pages_cache.popitem(False)
        return result
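popitem(False) (i.e. popitem(last=False)) is only available on collections.OrderedDict, so wikipedia_pages_cache is presumably one; a plain dict would raise a TypeError here. A minimal sketch of that FIFO cache, with the size limit of 4 taken from the method above (class and attribute names are illustrative):

from collections import OrderedDict
import wikipedia

class PageHtmlCache:
    def __init__(self, max_size=4):
        self._cache = OrderedDict()
        self._max_size = max_size

    def get(self, title):
        if title in self._cache:
            return self._cache[title]
        result = str(wikipedia.page(title).html())
        self._cache[title] = result
        if len(self._cache) > self._max_size:
            # Evict the entry that was inserted first (FIFO).
            self._cache.popitem(last=False)
        return result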
Example #21
  def test_disambiguate(self):
    """Test that page raises an error when a disambiguation page is reached."""
    try:
      ram = wikipedia.page("Smith", auto_suggest=False, redirect=False)
      error_raised = False
    except wikipedia.DisambiguationError as e:
      error_raised = True
      options = e.options

    self.assertTrue(error_raised)
    self.assertEqual(options, [u'Dodge Ramcharger', u'Dodge Ram Van', u'Dodge Mini Ram', u'Dodge Caravan C/V', u'Dodge Caravan C/V', u'Ram C/V', u'Dodge Ram 50', u'Dodge D-Series', u'Dodge Rampage', u'Ram (brand)'])
Example #22
 def wikiSearch(self, command):
     reg_ex = re.search(
         'tell me about (.*)',
         command)  # Get what the user is trying to search for after the keywords
     try:
         if reg_ex:
             topic = reg_ex.group(1)
             wikiResponse = wikipedia.page(topic)
             s.AIResponse(str(wikiResponse.content[:500].encode('utf-8')))
     except Exception as e:
         print(e)
Example #23
 async def w(self, ctx, *, sq: str):
     try:
         s = wikipedia.search(sq, results=1)
     except Exception:
         await ctx.send("I didn't find anything.")
     else:
         try:
             p = wikipedia.page(s)
         except Exception:
             await ctx.send("Hmm. Can you be more a little more specific?")
         else:
             await ctx.send(p.url)
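Note that wikipedia.search() returns a list of title strings, so the page lookup above most likely wants the first element rather than the whole list. A hedged sketch of that adjustment, stripped of the Discord-specific context:

import wikipedia

def first_result_url(query):
    # search() yields a list of matching titles; use the first one, if any.
    results = wikipedia.search(query, results=1)
    if not results:
        return None
    return wikipedia.page(results[0]).url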
Example #24
def import_images_from_page(title):
    print("Importing from [" + title + "]")
    try:
        p = wikipedia.page(title)
    except wikipedia.PageError as e:
        print("could not load the page: " + str(e))
        return

    query_params = {
        'generator': 'images',
        'gimlimit': 'max',
        'prop': 'imageinfo',
        'iiprop': 'url',
        'titles': p.title,
    }
    try:
        request = wikipedia._wiki_request(**query_params)

        image_keys = request['query']['pages'].keys()
        images = (request['query']['pages'][key] for key in image_keys)
        urls_and_desc = filter(
            lambda x: re.search(r'(?:jpg|jpeg)$', x[0].lower()),
            ((image['imageinfo'][0]['url'], image['imageinfo'][0]['descriptionurl']) for image in images if image.get('imageinfo'))
        )
    except (KeyError, URLError) as e:
        print("could not load page images: " + str(e))
        return

    processed = set()

    for item in urls_and_desc:
        if item[0] in processed:
            continue

        match = re.search(r'File:(.*?)(?:[0-9]{3})?\.(?:jpg|jpeg)$', unquote(item[1]))

        if match is None:
            continue

        file_title = re.sub(r'[_-]+', ' ', match.group(1)).strip()

        image = Image.create_from_dict({
            'title': file_title,
            'image_url': item[0],
            'description_url': item[1],
            'source': 'wiki',
        })

        image_collection.insert(image)
        processed.add(item[0])
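The _wiki_request(**query_params) call above matches older releases of the wikipedia package; newer releases expose the same private helper but expect a single params dict. A version-tolerant sketch (still a private API, so this is an assumption that may break without notice):

import wikipedia

def wiki_request_compat(params):
    # Newer wikipedia releases: _wiki_request(params) takes one dict.
    # Older releases: _wiki_request(**params) took keyword arguments.
    try:
        return wikipedia._wiki_request(params)
    except TypeError:
        return wikipedia._wiki_request(**params)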
Example #25
 def get(self):
     self.response.headers["Content-Type"] = "text/html; charset=utf-8"
     wikipedia.set_lang(u"ru")
     page = wikipedia.page(u"Проект:Города_России/Списки_улиц/Казани")
     text = page.content
     alphabet = re.split("\n\n\n==\s*...\s*==\n", text)
     byline = []
     for line in alphabet:
         byline.append(re.split("\n", line))
     byline.remove(byline[0])
     # byline[0] - names with number
     # byline[1..]- names with corresponding letter
     for line in byline[0]:
         self.response.write(json.dumps(line))
Example #26
def has_date(item):
    try:
        _page = wikipedia.page(title=None, pageid=item)
        _dt = datetime.strptime(_page.touched, '%Y-%m-%dT%H:%M:%SZ')
        if _dt.year >= 2015:
            _revs = get_revisions(_page.pageid, False)
            _tmpdt = datetime.strptime(_revs[0]['timestamp'],
                                       '%Y-%m-%dT%H:%M:%SZ')
            if _tmpdt.year >= 2015 and _tmpdt.year <= 2017:
                return _page
            else:
                return ""
        else:
            return ""
    except:
        return "Error"
Example #27
def extract_actor_from_wikipedia(lastname, firstname):
    wikipedia.set_lang("fr")

    searchs = wikipedia.search(lastname + " " + firstname)

    for search in searchs:
        page = wikipedia.page(search)
        rc = {"links": list({"title": "wikipedia", "url": page.url})}

        if lastname in page.title and firstname in page.title:
            rc = dict({"links": [], "name": firstname + " " + lastname})
            for img in page.images:
                if img.endswith(".jpg"): rc["photo"] = img

            save_domains = [
                "unifrance.org", "www.lefilmfrancais", "www.allocine.fr",
                "catalogue.bnf.fr", "www.allmovie.com"
            ]
            libs = [
                "UniFrance", "Le Film Francais", "Allocine", "La BNF",
                "All movie"
            ]
            try:
                for ref in page.references:
                    domain = urlparse(ref).netloc
                    try:
                        idx = save_domains.index(domain)
                        rc["links"].append({"title": libs[idx], "url": ref})
                    except:
                        pass
            except:
                pass

            html: wikipedia.BeautifulSoup = wikipedia.BeautifulSoup(
                page.html(), "html5lib")
            # Look for the films section
            # for link in html.findAll('a', attrs={'href': wikipedia.re.compile("^http://")}):
            #     if "film" in link.text:
            #         pass

            rc["summary"] = page.summary
            rc["title"] = page.title
            rc["url"] = page.url

            return rc

    return None
Example #28
def get_articles(lat, lon):
    """
    :type lat: str
    :type lon: str
    :return: list of dicts representing articles
    """

    # Use really large radius, in case very far away from somewhere.
    # Results are sorted by distance and limited so that works fine.
    radius = 20000  # Upper limit
    landmark_articles = wikilocation.articles(lat, lon, radius, 10, "landmark")
    # event_articles = wikilocation.articles(lat, lon, radius, 5, "event")

    if len(landmark_articles) == 0:
        OLD_STREET_ROUNDABOUT = ("51.525603", "-0.087558")
        lat, lon = OLD_STREET_ROUNDABOUT
        landmark_articles = wikilocation.articles(lat, lon, radius, 10,
                                                  "landmark")

    # wikilocation_articles = event_articles + landmark_articles
    # wikilocation_articles = random.sample(wikilocation_articles, 5)
    # wikilocation_articles = _interleave(landmark_articles, event_articles)
    wikilocation_articles = landmark_articles
    wikilocation_articles = _remove_lists(wikilocation_articles)

    articles = []
    for wikilocation_article in wikilocation_articles:

        article = {}

        title = wikilocation_article["title"]
        article["title"] = title

        # first_sentence = wikipedia.summary(title, sentences=1)
        page = wikipedia.page(title)
        # article["first_sentence"] = first_sentence
        article["summary"] = page.summary

        article[
            "image"] = "http://upload.wikimedia.org/wikipedia/commons/3/3c/Stonehenge2007_07_30.jpg"

        article["url"] = page.url

        articles.append(article)

    return articles
Example #29
def get_articles(lat, lon):
    """
    :type lat: str
    :type lon: str
    :return: list of dicts representing articles
    """

    # Use really large radius, in case very far away from somewhere.
    # Results are sorted by distance and limited so that works fine.
    radius = 20000  # Upper limit
    landmark_articles = wikilocation.articles(lat, lon, radius, 10, "landmark")
    # event_articles = wikilocation.articles(lat, lon, radius, 5, "event")

    if len(landmark_articles) == 0:
        OLD_STREET_ROUNDABOUT = ("51.525603", "-0.087558")
        lat, lon = OLD_STREET_ROUNDABOUT
        landmark_articles = wikilocation.articles(lat, lon, radius, 10, "landmark")

    # wikilocation_articles = event_articles + landmark_articles
    # wikilocation_articles = random.sample(wikilocation_articles, 5)
    # wikilocation_articles = _interleave(landmark_articles, event_articles)
    wikilocation_articles = landmark_articles
    wikilocation_articles = _remove_lists(wikilocation_articles)

    articles = []
    for wikilocation_article in wikilocation_articles:

        article = {}

        title = wikilocation_article["title"]
        article["title"] = title

        # first_sentence = wikipedia.summary(title, sentences=1)
        page = wikipedia.page(title)
        # article["first_sentence"] = first_sentence
        article["summary"] = page.summary

        article["image"] = "http://upload.wikimedia.org/wikipedia/commons/3/3c/Stonehenge2007_07_30.jpg"

        article["url"] = page.url

        articles.append(article)

    return articles
Example #30
    def extract(self, request: ExtractorRequest) -> ExtractorResponse:
        try:
            # last url path segment should be our page name, e.g. "Patellar_dislocation"
            page_name = request.url.split("/")[-1]

            page = wikipedia.page(page_name)
            text = page.content
            meta = {
                "source": "wikipedia",
                "source_url": page.url,
                "title": page.title,
                "summary": page.summary,
                "images": page.images,
                "references": page.references,
            }
            # construct response
            response_meta = {**(request.meta or {}), **meta}
            response = ExtractorResponse(meta=response_meta, text=text or "")
        except Exception as e:
            msg = f"Error using wikipedia extractor: {str(e)}"
            log.error(msg)
            response = ExtractorResponse(error=msg)

        return response
Example #31
 def test_redirect_false(self):
   """Test that page raises an error on a redirect when redirect == False."""
   mp = lambda: wikipedia.page("Menlo Park, New Jersey", auto_suggest=False, redirect=False)
   self.assertRaises(wikipedia.RedirectError, mp)
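The lambda-plus-assertRaises pattern above can also be written with the context-manager form of assertRaises, which reads a little more directly; a minimal equivalent sketch:

import unittest
import wikipedia

class RedirectFalseTest(unittest.TestCase):
    def test_redirect_false(self):
        """Redirects should raise RedirectError when redirect=False."""
        with self.assertRaises(wikipedia.RedirectError):
            wikipedia.page("Menlo Park, New Jersey", auto_suggest=False, redirect=False)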
Example #32
	def setUp(self):
		# one of the shortest wikipedia articles that includes images
		self.celtuce = wikipedia.page("Celtuce")
Example #33
 def test_missing(self):
   """Test that page raises a PageError for a nonexistant page."""
   # Callicarpa?
   purpleberry = lambda: wikipedia.page("purpleberry", auto_suggest=False)
   self.assertRaises(wikipedia.PageError, purpleberry)
Example #34
 def test_disambiguation_error_page_function(self):
     with self.assertRaises(Exception) as context:
         wikipedia.page("New York")
     self.assertFalse(
         "wikipedia.exceptions.DisambiguationError: \"{0}\" may refer to: \n{1}"
         in str(context.exception))
Example #35
 def setUp(self):
     # shortest wikipedia articles with images and sections
     self.celtuce = wikipedia.page("Celtuce")
     self.cyclone = wikipedia.page("Tropical Depression Ten (2005)")
     self.great_wall_of_china = wikipedia.page("Great Wall of China")
Example #36
 def test_from_page_id(self):
   """Test loading from a page id"""
   self.assertEqual(self.celtuce, wikipedia.page(pageid=1868108))
Example #37
print 'layers down = ' + str(args.NumLayers)
print 'sentences in = ' + str(args.NumSentences)

if args.NumLayers < 0 or args.NumLayers > 10:
   print 'Too many or too few layers'
   exit()
if args.NumSentences < 1 or args.NumSentences > 10:
   print 'Too many or too few sentences'
   exit()

# randomly choose a page to start at from the list
pageName = choice(SOURCE_PAGE_NAMES)

# get the page
page = wikipedia.page(pageName)

# for each depth to traverse,
   # randomly choose a link to go down
page = traverseDepth(page, args.NumLayers)

# randomly choose a section. Keep trying until you find one that has content
# (they may be empty). Don't try too many times
sectionTitle = ''
section = ''
remaining = len(page.sections)
while remaining > 0:
   print '.'
   sectionTitle = choice(page.sections)
   section = page.section(sectionTitle)
Example #38
 def get(self):
     return wikipedia_api.page(self._number)
Example #39
 def setUp(self):
     # shortest wikipedia articles with images and sections
     self.celtuce = wikipedia.page("Celtuce")
     self.cyclone = wikipedia.page("Tropical Depression Ten (2005)")
Example #40
	def test_something3(self):
		x = wikipedia.page('Cathode Rays', auto_suggest=False)
		print x.pageid
		print x.content.encode('utf-8')
Example #41
print('\n')
print("Digging for company info...")

a = len(dive)
b = 0
c = []

for thing in dive:
    b = b + 1
    sys.stdout.write('\r')
    sys.stdout.write('%.0f%% complete' % (b / a * 100, ))
    sys.stdout.flush()
    thing = str(thing)
    # manually storing desired URL
    url1 = wikipedia.page(thing)
    url2 = url1.url
    c.append(url2)
    req = requests.get(url2)
    store = etree.fromstring(req.text)
    output = store.xpath(
        '//*[@id="mw-content-text"]/div/table[1]/tbody/tr[11]')
    #print(output)

print('\n')
for thing in c:
    print(thing)

#'//*[@id="mw-content-text"]/div/table[1]/tbody/tr[17]/th'
#'//*[@id="mw-content-text"]/div/table[1]/tbody/tr[17]/td'
#'//*[@id="mw-content-text"]/div/table[1]/tbody/tr[11]/td/span/text()'
Example #42
def test():
    form = LoginForm()
    if form.validate_on_submit():
        flash(form.openid.data, 'Question')
        text = form.openid.data.lower()
        data = form.openid.data.lower()
        data1 = form.openid.data
        text = text.split()
        negator = [
            'not', 'never', 'not possible', 'does not', 'abort', 'neither',
            'nor', 'no', 'negative', 'negate'
        ]
        assertor = ['may be', 'can be', 'not sure', 'might', 'may']
        preposition = [
            'have', 'is', 'are', 'about', 'above', 'across', 'after',
            'against', 'along', 'among', 'around', 'at', 'before', 'behind',
            'below', 'beneath', 'beside', 'between', 'by', 'down', 'during',
            'except', 'for', 'from', 'front', 'inside', 'instead', 'into',
            'like', 'near', 'of', 'off', 'on', 'onto', 'top', 'out', 'outside',
            'over', 'past', 'since', 'through', 'to', 'toward', 'under',
            'underneath', 'until', 'up', 'upon', 'with', 'within', 'without'
        ]
        wh = [
            'why', 'what', 'how', 'Who', 'whoever', 'whom', 'whomever',
            'whose', 'which'
        ]
        pronoun = [
            'i', 'me', 'you', 'she', 'her', 'he', 'him', 'it', 'we', 'us',
            'you', 'they', 'them', 'my', 'mine', 'your', 'yours', 'hers',
            'his', 'its', 'yours', 'ours', 'theirs', 'myself', 'yourself',
            'himself', 'herself', 'itself', 'all', 'another', 'any', 'anybody',
            'anyone', 'anything', 'both', 'each', 'either', 'everybody',
            'everyone', 'everything', 'few', 'many', 'neither', 'nobody',
            'none', 'nothing', 'one', 'several', 'some', 'somebody', 'someone',
            'something', 'this', 'that', 'these', 'those'
        ]
        # Removing Wh Question
        wh_q = ''
        for ser in text:
            inflag = 0
            for w in wh:
                if w == ser:
                    inflag = 1
            if inflag == 0:
                wh_q = wh_q + ser + ' '

        # Removing Preposition
        wh_q = wh_q.split()
        prep_q = ''
        for ser in wh_q:
            inflag = 0
            for prep in preposition:
                if ser == prep:
                    inflag = 1
            if inflag == 0:
                prep_q = prep_q + ser + ' '

        # Removing Pronoun
        prep_q = prep_q.split()
        pro_q = ''
        for ser in prep_q:
            inflag = 0
            for pro in pronoun:
                if ser == pro:
                    inflag = 1
            if inflag == 0:
                pro_q = pro_q + ser + ' '

        text = pro_q
        text = text.split()
        data = pro_q.strip()
        flag = 0
        answer = 0
        wikiflag = 0
        ans = 0

        data = ''
        asser = 0
        nege = 0
        posi = 0
        #Assertive Section
        for ser in text:
            inflag = 0
            for ass in assertor:
                if ser == ass and flag == 0 or data.find(
                        ass) != -1 and flag == 0:
                    inflag = 1
                    asser = 1
                    flash('Assertive', 'Answer')
                    flag = 1
            if inflag == 0:
                data = data + ser + ' '
        if asser == 1:
            data = data.strip()
            abc = models.Assertive.query.all()
            for a in abc:
                if (data.find(a.question.lower()) != -1
                        or a.question.lower().find(data) != -1
                    ) and len(data) >= 4:
                    ans = 1
                    break
            if ans == 0:
                answer = 0
            else:
                answer = 1

            if answer == 0:
                flash(
                    'Answer not in database... Lets search Wikipedia Database',
                    'Answer')
                wikiflag = 1
                #return redirect ('http://www.lmgtfy.com/?q=' + data)
            else:
                finalans = a.answer
                flash(a.answer, 'Answer')

        #Negative Section
        if asser == 0:
            data = ''
        for ser in text:
            inflag = 0
            for neg in negator:
                if ser == neg and flag == 0 or data.find(
                        neg) != -1 and flag == 0:
                    inflag = 1
                    nege = 1
                    flash('Negative', 'Answer')
                    flag = 1
            if inflag == 0:
                data = data + ser + ' '
        if nege == 1:
            data = data.strip()
            abc = models.Negative.query.all()
            for a in abc:
                if (data.find(a.question.lower()) != -1
                        or a.question.lower().find(data) != -1
                    ) and len(data) >= 4:
                    ans = 1
                    break
            if ans == 0:
                answer = 0
            else:
                answer = 1

            if answer == 0:
                flash(
                    'Answer not in database... Lets search Wikipedia Database',
                    'Answer')
                wikiflag = 1
                #return redirect ('http://www.lmgtfy.com/?q=' + data)
            else:
                finalans = a.answer
                flash(a.answer, 'Answer')

        #Positive Section

        if flag == 0:
            data = form.openid.data.lower()
            flash('Positive', 'Answer')
            abc = models.Positive.query.all()
            for a in abc:
                if (data.find(a.question.lower()) != -1
                        or a.question.lower().find(data) != -1
                    ) and len(data) >= 4:
                    ans = 1
                    break
            if ans == 0:
                answer = 0
            else:
                answer = 1

            if answer == 0:
                flash(
                    'Answer not in database... Lets search Wikipedia Database',
                    'Answer')
                wikiflag = 1
                #return redirect ('http://www.lmgtfy.com/?q=' + data)
            else:
                finalans = a.answer
                flash(a.answer, 'Answer')

        #Wiki Section
        ans = 0
        if wikiflag == 1:
            abc = models.Wikipedia.query.all()
            for a in abc:
                if (data.find(a.question.lower()) != -1
                        or a.question.lower().find(data) != -1
                    ) and len(data) >= 4:
                    ans = 1
                    break
            if ans == 0:
                answer = 0
            else:
                answer = 1

            if answer == 0:
                flash(
                    'Answer not in Wikipedia database... Lets search Wikipedia Internet',
                    'Answer')
                ny = wikipedia.search(data)
                if ny == []:
                    return redirect('http://www.lmgtfy.com/?q=' + data1)
                else:
                    try:
                        ny1 = wikipedia.summary(data1,
                                                chars=0,
                                                auto_suggest=True,
                                                redirect=True,
                                                sentences=3)
                        finalans = ny1
                        flash(ny1, 'Answer')
                        ny2 = wikipedia.page(data1)
                        flash('Source: ' + ny2.url, 'Answer')
                        #u = models.Wikipedia(question=data, answer=ny1)
                        #db.session.add(u)
                        #db.session.commit()
                    except Exception as inst:
                        flash(
                            'Your question is either out of scope of very trival for me to answer',
                            'Answer')
                        finalans = 'Your question is either out of scope of very trival for me to answer'
            else:
                finalans = a.answer
                flash(a.answer, 'Answer')
        display = '\n'
        s = models.Chats.query.all()
        for chat in reversed(s):
            flash('Question: ' + chat.question, 'Display')
            flash('Answer: ' + chat.answer, 'Display')
            flash('.', 'Display')
        u = models.Chats(question=data1, answer=finalans)
        db.session.add(u)
        db.session.commit()

        return redirect('/test')
    return render_template("index2.html", title='ChatterBot', form=form)
Example #43
 def test_redirect_with_normalization(self):
   """Test that a page redirect with a normalized query loads correctly"""
   the_party = wikipedia.page("communist Party", auto_suggest=False)
   self.assertIsInstance(the_party, wikipedia.WikipediaPage)
   self.assertEqual(the_party.title, "Communist party")
Example #44
def wiki_response(request_text):
    try:
        txt = str(wikipedia.page(request_text).content[:1000])
    except Exception:
        txt = 'По данному запросу ничего не найдено.'
    return txt
Example #45
def test():
    form = LoginForm()
    if form.validate_on_submit():
        flash(form.openid.data , 'Question')
        text = form.openid.data.lower()
        data = form.openid.data.lower()
        data1 = form.openid.data
        text = text.split()
        negator = ['not', 'never', 'not possible', 'does not', 'abort', 'neither', 'nor', 'no', 'negative', 'negate']
        assertor = ['may be', 'can be', 'not sure', 'might', 'may']
        preposition = ['have', 'is', 'are', 'about', 'above', 'across', 'after', 'against', 'along', 'among', 'around', 'at', 'before', 'behind', 'below', 'beneath', 'beside', 'between', 'by', 'down', 'during', 'except', 'for', 'from', 'front', 'inside', 'instead', 'into', 'like', 'near', 'of', 'off', 'on', 'onto', 'top', 'out', 'outside', 'over', 'past', 'since', 'through', 'to', 'toward', 'under', 'underneath', 'until', 'up', 'upon', 'with', 'within', 'without']
        wh = ['why', 'what', 'how', 'Who', 'whoever', 'whom', 'whomever', 'whose', 'which']
        pronoun = ['i', 'me', 'you', 'she', 'her', 'he', 'him', 'it', 'we', 'us', 'you', 'they', 'them', 'my', 'mine', 'your', 'yours', 'hers', 'his', 'its', 'yours', 'ours', 'theirs', 'myself', 'yourself', 'himself', 'herself', 'itself', 'all', 'another', 'any', 'anybody', 'anyone', 'anything', 'both', 'each', 'either', 'everybody', 'everyone', 'everything', 'few', 'many', 'neither', 'nobody', 'none', 'nothing', 'one', 'several', 'some', 'somebody', 'someone', 'something', 'this', 'that', 'these', 'those']
        # Removing Wh Question
        wh_q=''
        for ser in text:
            inflag = 0
            for w in wh:
                if w == ser:
                    inflag = 1
            if inflag == 0:
                wh_q = wh_q + ser + ' '


        # Removing Preposition
        wh_q = wh_q.split()
        prep_q = ''
        for ser in wh_q:
            inflag = 0
            for prep in preposition:
                if ser == prep:
                    inflag = 1
            if inflag == 0:
                prep_q = prep_q + ser + ' '


        # Removing Pronoun
        prep_q = prep_q.split()
        pro_q = ''
        for ser in prep_q:
            inflag = 0
            for pro in pronoun:
                if ser == pro:
                    inflag = 1
            if inflag == 0:
                pro_q = pro_q + ser + ' '

        text = pro_q
        text = text.split()
        data = pro_q.strip()
        flag = 0
        answer = 0
        wikiflag = 0
        ans = 0

        data = ''
        asser = 0
        nege = 0
        posi = 0
        #Assertive Section
        for ser in text:
            inflag = 0
            for ass in assertor:
                if ser == ass and flag == 0 or data.find(ass) != -1 and flag == 0:
                    inflag = 1
                    asser = 1
                    flash('Assertive', 'Answer')
                    flag=1
            if inflag == 0:
                data = data + ser + ' '
        if asser == 1:
            data = data.strip()
            abc = models.Assertive.query.all()
            for a in abc:
                if (data.find(a.question.lower()) != -1 or a.question.lower().find(data) != -1) and len(data) >= 4:
                    ans = 1
                    break
            if ans == 0:
                answer = 0
            else:
                answer = 1

            if answer == 0:
                flash('Answer not in database... Lets search Wikipedia Database', 'Answer')
                wikiflag = 1
                #return redirect ('http://www.lmgtfy.com/?q=' + data)
            else:
                finalans=a.answer
                flash(a.answer, 'Answer')

        #Negative Section
        if asser == 0:
            data = '' 
        for ser in text:
            inflag = 0           
            for neg in negator:
                if ser == neg and flag == 0 or data.find(neg) != -1 and flag == 0:
                    inflag = 1
                    nege = 1
                    flash('Negative', 'Answer')
                    flag = 1
            if inflag == 0:
                data = data + ser + ' '
        if nege == 1:
            data = data.strip()
            abc = models.Negative.query.all()
            for a in abc:
                if (data.find(a.question.lower()) != -1 or a.question.lower().find(data) != -1) and len(data) >= 4:
                    ans = 1
                    break
            if ans == 0:
                answer = 0
            else:
                answer = 1

            if answer == 0:
                flash('Answer not in database... Lets search Wikipedia Database', 'Answer')
                wikiflag = 1
                #return redirect ('http://www.lmgtfy.com/?q=' + data)
            else:
                finalans=a.answer
                flash(a.answer, 'Answer')

        #Positive Section

        if flag == 0:
            data = form.openid.data.lower()
            flash('Positive', 'Answer')
            abc = models.Positive.query.all()
            for a in abc:
                if (data.find(a.question.lower()) != -1 or a.question.lower().find(data) != -1) and len(data) >= 4:
                    ans = 1
                    break
            if ans == 0:
                answer = 0
            else:
                answer = 1

            if answer == 0:
                flash('Answer not in database... Lets search Wikipedia Database', 'Answer')
                wikiflag = 1
                #return redirect ('http://www.lmgtfy.com/?q=' + data)
            else:
                finalans=a.answer
                flash(a.answer, 'Answer')

        #Wiki Section
        ans = 0
        if wikiflag == 1:
            abc = models.Wikipedia.query.all()
            for a in abc:
                if (data.find(a.question.lower()) != -1 or a.question.lower().find(data) != -1) and len(data) >= 4:
                    ans = 1
                    break
            if ans == 0:
                answer = 0
            else:
                answer = 1

            if answer == 0:
                flash('Answer not in Wikipedia database... Lets search Wikipedia Internet', 'Answer')
                ny = wikipedia.search(data)
                if ny == []:
                    return redirect ('http://www.lmgtfy.com/?q=' + data1)
                else:
                    try:
                        ny1 = wikipedia.summary(data1, chars=0, auto_suggest=True, redirect=True, sentences=3)
                        finalans=ny1
                        flash(ny1, 'Answer')
                        ny2 = wikipedia.page(data1)
                        flash('Source: '+ ny2.url, 'Answer')
                        #u = models.Wikipedia(question=data, answer=ny1)
                        #db.session.add(u)
                        #db.session.commit()
                    except Exception as inst:
                        flash('Your question is either out of scope of very trival for me to answer', 'Answer')
                        finalans = 'Your question is either out of scope of very trival for me to answer'
            else:
                finalans=a.answer
                flash(a.answer, 'Answer')
        display = '\n'
        s = models.Chats.query.all()
        for chat in reversed(s):
            flash('Question: ' + chat.question, 'Display')
            flash('Answer: ' + chat.answer , 'Display')
            flash('.', 'Display')
        u = models.Chats(question=data1, answer=finalans)
        db.session.add(u)
        db.session.commit() 

        return redirect('/test')
    return render_template("index2.html",
        title = 'ChatterBot',
        form = form)
Example #46
def test():
    form = LoginForm()
    if form.validate_on_submit():
        flash(form.openid.data , 'Question')
        text = form.openid.data.lower()
        data = form.openid.data.lower() # for processing of answer(data mining)
        data1 = form.openid.data # for finding verbs nouns adjectives and number
        text = text.split() # for finding positive negative and assertive

        # Finding Nouns
        tokenized = nltk.word_tokenize(data1)
        p = nltk.pos_tag(tokenized)
        flash(p, 'Answer')
        name = nltk.ne_chunk(p, binary=True)
        ent = re.findall(r'NE\s(.*?)/', str(name))
        chunkGram = r"""Noun: {<NN\w?>} """
        chunkParser = nltk.RegexpParser(chunkGram)
        NNnoun = chunkParser.parse(p)
        ip_noun = re.findall(r'Noun\s(.*?)/', str(NNnoun))
        #noun = re.findall(r'<NN\w?>*', str(p))
        #print ent
        #nouns = ''
        #for n in ip_noun:
        #    nouns = nouns + n + ' '
        #flash ('Nouns: ' + str(nouns), 'Answer')
        flash ('Nouns list: ' + str(ip_noun), 'Answer')

        # Finding Verbs
        tokenized = nltk.word_tokenize(data1)
        p = nltk.pos_tag(tokenized)
        name = nltk.ne_chunk(p, binary=True)

        chunkGram = r"""Verb: {<VB\w?>} """
        chunkParser = nltk.RegexpParser(chunkGram)
        VBverb = chunkParser.parse(p)
        ip_verb = re.findall(r'Verb\s(.*?)/', str(VBverb))
        #noun = re.findall(r'<NN\w?>*', str(p))
        #print ent
        #verbs = ''
        #for v in ip_verb:
        #    verbs = verbs + v + ' '
        #flash ('Verbs: ' + str(verbs), 'Answer')
        flash ('Verb List: ' + str(ip_verb), 'Answer')

        # Finding Adjective
        tokenized = nltk.word_tokenize(data1)
        p = nltk.pos_tag(tokenized)
        name = nltk.ne_chunk(p, binary=True)

        chunkGram = r"""Verb: {<JJ\w?>} """
        chunkParser = nltk.RegexpParser(chunkGram)
        JJAdj = chunkParser.parse(p)
        ip_adj = re.findall(r'Verb\s(.*?)/', str(JJAdj))
        #noun = re.findall(r'<NN\w?>*', str(p))
        #print ent
        #adjs = ''
        #for a in ip_adj:
        #    adjs = adjs + a + ' '
        #flash ('Ajectives: ' + str(adjs), 'Answer')
        flash ('Adjective list: ' + str(ip_adj), 'Answer')

        # Finding Numbers
        tokenized = nltk.word_tokenize(data1)
        p = nltk.pos_tag(tokenized)
        name = nltk.ne_chunk(p, binary=True)
        chunkGram = r"""Number: {<CD\w?>} """
        chunkParser = nltk.RegexpParser(chunkGram)
        CDNumber = chunkParser.parse(p)
        ip_number = re.findall(r'Number\s(.*?)/', str(CDNumber))
        flash ('Number list: ' + str(ip_number), 'Answer')

        max_check = len(ip_noun) + len(ip_verb) + len(ip_adj) + len(ip_number) #counting the number of max hits

        # Similar Noun Form
        simi = models.Similar.query.all()
        count_n = len(ip_noun)
        max_n = 0
        for noun_sim in ip_noun:
            for sim in simi:
                if sim.word1 == noun_sim:
                    ip_noun.append(str(sim.word2))
                    ip_noun.append(str(sim.word3))
                if sim.word2 == noun_sim:
                    ip_noun.append(str(sim.word1))
                    ip_noun.append(str(sim.word3))
                if sim.word3 == noun_sim:
                    ip_noun.append(str(sim.word1))
                    ip_noun.append(str(sim.word2))
            max_n = max_n + 1
            if max_n >= count_n:
                break


        # Similar Verb Form
        simi = models.Similar.query.all()
        count_v = len(ip_verb)
        max_v = 0
        for verb_sim in ip_verb:
            for sim in simi:
                if sim.word1 == verb_sim:
                    ip_verb.append(str(sim.word2))
                    ip_verb.append(str(sim.word3))
                if sim.word2 == verb_sim:
                    ip_verb.append(str(sim.word1))
                    ip_verb.append(str(sim.word3))
                if sim.word3 == verb_sim:
                    ip_verb.append(str(sim.word1))
                    ip_verb.append(str(sim.word2))
            max_v = max_v + 1
            if max_v >= count_v:
                break

        # Similar Adjective Form
        simi = models.Similar.query.all()
        count_a = len(ip_adj)
        max_a = 0
        for adj_sim in ip_adj:
            for sim in simi:
                if sim.word1 == adj_sim:
                    ip_adj.append(str(sim.word2))
                    ip_adj.append(str(sim.word3))
                if sim.word2 == adj_sim:
                    ip_adj.append(str(sim.word1))
                    ip_adj.append(str(sim.word3))
                if sim.word3 == adj_sim:
                    ip_adj.append(str(sim.word1))
                    ip_adj.append(str(sim.word2))
            max_a = max_a + 1
            if max_a >= count_a:
                break

        #Printing the new appended list        
        flash ('Nouns list: ' + str(ip_noun), 'Answer')
        flash ('Verb List: ' + str(ip_verb), 'Answer')
        flash ('Adjective list: ' + str(ip_adj), 'Answer')
        flash ('Number list: ' + str(ip_number), 'Answer')

        ip_total = ip_noun + ip_verb + ip_adj + ip_number
        ip_total = list(set(ip_total))

        negator = ['not', 'never', 'not possible', 'does not', 'abort', 'neither', 'nor', 'negative', 'negate', 'can\'t', 'doesn\'t','can not','cant','doesnt','dont','don\'t']
        assertor = ['may be', 'can be', 'not sure', 'might', 'may']
        '''preposition = ['have', 'is', 'are', 'about', 'above', 'across', 'after', 'against', 'along', 'among', 'around', 'at', 'before', 'behind', 'below', 'beneath', 'beside', 'between', 'by', 'down', 'during', 'except', 'for', 'from', 'front', 'inside', 'instead', 'into', 'like', 'near', 'of', 'off', 'on', 'onto', 'top', 'out', 'outside', 'over', 'past', 'since', 'through', 'to', 'toward', 'under', 'underneath', 'until', 'up', 'upon', 'with', 'within', 'without']
        wh = ['why', 'what', 'how', 'Who', 'whoever', 'whom', 'whomever', 'whose', 'which']
        pronoun = ['i', 'me', 'you', 'she', 'her', 'he', 'him', 'it', 'we', 'us', 'you', 'they', 'them', 'my', 'mine', 'your', 'yours', 'hers', 'his', 'its', 'yours', 'ours', 'theirs', 'myself', 'yourself', 'himself', 'herself', 'itself', 'all', 'another', 'any', 'anybody', 'anyone', 'anything', 'both', 'each', 'either', 'everybody', 'everyone', 'everything', 'few', 'many', 'neither', 'nobody', 'none', 'nothing', 'one', 'several', 'some', 'somebody', 'someone', 'something', 'this', 'that', 'these', 'those']
        # Removing Wh Question
        wh_q=''
        for ser in text:
            inflag = 0
            for w in wh:
                if w == ser:
                    inflag = 1
            if inflag == 0:
                wh_q = wh_q + ser + ' '


        # Removing Preposition
        wh_q = wh_q.split()
        prep_q = ''
        for ser in wh_q:
            inflag = 0
            for prep in preposition:
                if ser == prep:
                    inflag = 1
            if inflag == 0:
                prep_q = prep_q + ser + ' '


        # Removing Pronoun
        prep_q = prep_q.split()
        pro_q = ''
        for ser in prep_q:
            inflag = 0
            for pro in pronoun:
                if ser == pro:
                    inflag = 1
            if inflag == 0:
                pro_q = pro_q + ser + ' '

        text = pro_q
        text = text.split()
        data = pro_q.strip()
        
        '''
        flag = 0
        answer = 0
        wikiflag = 0
        ans = 0
        asser = 0
        nege = 0
        posi = 0
        #Assertive Section
        for ser in text:
            for ass in assertor:
                if ser == ass and flag == 0 or data.find(ass) != -1 and flag == 0:
                    asser = 1
                    flash('Assertive', 'Answer')
                    flag=1
        if asser == 1:
            display_ans = ''
            max_value = int(max_check * 0.8 + 0.5) # counting the no of hits
            abc = models.Positive.query.all()
            for a in abc:
                # Noun
                tokenized = nltk.word_tokenize(a.question)
                p = nltk.pos_tag(tokenized)
                name = nltk.ne_chunk(p, binary=True)
                ent = re.findall(r'NE\s(.*?)/', str(name))
                chunkGram = r"""Noun: {<NN\w?>} """
                chunkParser = nltk.RegexpParser(chunkGram)
                NNnoun = chunkParser.parse(p)
                db_noun = re.findall(r'Noun\s(.*?)/', str(NNnoun))

                # Verbs
                tokenized = nltk.word_tokenize(a.question)
                p = nltk.pos_tag(tokenized)
                name = nltk.ne_chunk(p, binary=True)
                chunkGram = r"""Verb: {<VB\w?>} """
                chunkParser = nltk.RegexpParser(chunkGram)
                VBverb = chunkParser.parse(p)
                db_verb = re.findall(r'Verb\s(.*?)/', str(VBverb))

                # Adjective
                tokenized = nltk.word_tokenize(a.question)
                p = nltk.pos_tag(tokenized)
                name = nltk.ne_chunk(p, binary=True)
                chunkGram = r"""Verb: {<JJ\w?>} """
                chunkParser = nltk.RegexpParser(chunkGram)
                JJAdj = chunkParser.parse(p)
                db_adj = re.findall(r'Verb\s(.*?)/', str(JJAdj))


                # Number
                tokenized = nltk.word_tokenize(a.question)
                p = nltk.pos_tag(tokenized)
                name = nltk.ne_chunk(p, binary=True)
                chunkGram = r"""Number: {<CD\w?>} """
                chunkParser = nltk.RegexpParser(chunkGram)
                CDNumber = chunkParser.parse(p)
                db_number = re.findall(r'Number\s(.*?)/', str(CDNumber))

                db_total = db_noun + db_adj + db_verb + db_number
                db_total = list(set(db_total))

                count = 0
                for ip in ip_total:
                    for dbs in db_total:
                        db_plural = re.escape(dbs) + 's?'
                        ip_plural = re.escape(ip) + 's?'
                        if re.match(db_plural, ip,flags=0|re.IGNORECASE):
                            count = count + 1
                        if re.match(ip_plural,dbs,flags=0|re.IGNORECASE):
                            count = count + 1
                        if ip == dbs:
                            count = count - 1

                if max_value < count:
                    display_ans = a.answer
                    max_value = count

            if display_ans == '':
                answer = 0
            else:
                answer = 1

            if answer == 0:
                flash('Answer not in database... Lets search Wikipedia Database', 'Answer')
                wikiflag = 1
            else:
                extra = 'Please be more sure about the problem you are facing so that we can provide you with precise answers. According to me the most relevant solution to your problem is: '
                display_ans = extra + '\n' + display_ans
                flash(display_ans, 'Answer')

             
            """for a in abc:
                if (data.find(a.question.lower()) != -1 or a.question.lower().find(data) != -1) and len(data) >= 4:
                    ans = 1
                    break
            if ans == 0:
                answer = 0
            else:
                answer = 1

            if answer == 0:
                flash('Answer not in database... Lets search Wikipedia Database', 'Answer')
                wikiflag = 1
                #return redirect ('http://www.lmgtfy.com/?q=' + data)
            else:
                finalans=a.answer
                flash(a.answer, 'Answer')"""

        #Negative Section
        if asser != 1:
            for ser in text:          
                for neg in negator:
                    if ser == neg and flag == 0 or data.find(neg) != -1 and flag == 0:
                        nege = 1
                        flash('Negative', 'Answer')
                        flag = 1
            if nege == 1:
                display_ans = ''
                max_value = int(max_check * 0.8 + 0.5) # counting the no of hits
                abc = models.Negative.query.all()
                for a in abc:
                    # Noun
                    tokenized = nltk.word_tokenize(a.question)
                    p = nltk.pos_tag(tokenized)
                    name = nltk.ne_chunk(p, binary=True)
                    ent = re.findall(r'NE\s(.*?)/', str(name))
                    chunkGram = r"""Noun: {<NN\w?>} """
                    chunkParser = nltk.RegexpParser(chunkGram)
                    NNnoun = chunkParser.parse(p)
                    db_noun = re.findall(r'Noun\s(.*?)/', str(NNnoun))

                    # Verbs
                    tokenized = nltk.word_tokenize(a.question)
                    p = nltk.pos_tag(tokenized)
                    name = nltk.ne_chunk(p, binary=True)
                    chunkGram = r"""Verb: {<VB\w?>} """
                    chunkParser = nltk.RegexpParser(chunkGram)
                    VBverb = chunkParser.parse(p)
                    db_verb = re.findall(r'Verb\s(.*?)/', str(VBverb))

                    # Adjective
                    tokenized = nltk.word_tokenize(a.question)
                    p = nltk.pos_tag(tokenized)
                    name = nltk.ne_chunk(p, binary=True)
                    chunkGram = r"""Verb: {<JJ\w?>} """
                    chunkParser = nltk.RegexpParser(chunkGram)
                    JJAdj = chunkParser.parse(p)
                    db_adj = re.findall(r'Verb\s(.*?)/', str(JJAdj))

                    # Number
                    tokenized = nltk.word_tokenize(a.question)
                    p = nltk.pos_tag(tokenized)
                    name = nltk.ne_chunk(p, binary=True)
                    chunkGram = r"""Number: {<CD\w?>} """
                    chunkParser = nltk.RegexpParser(chunkGram)
                    CDNumber = chunkParser.parse(p)
                    db_number = re.findall(r'Number\s(.*?)/', str(CDNumber))

                    db_total = db_noun + db_adj + db_verb + db_number
                    db_total = list(set(db_total))

                    count = 0
                    for ip in ip_total:
                        for dbs in db_total:
                            db_plural = re.escape(dbs) + 's?'
                            ip_plural = re.escape(ip) + 's?'
                            if re.match(db_plural, ip, flags=re.IGNORECASE):
                                count = count + 1
                            if re.match(ip_plural, dbs, flags=re.IGNORECASE):
                                count = count + 1
                            if ip == dbs:
                                count = count - 1

                    if max_value < count:
                        display_ans = a.answer
                        max_value = count

                if display_ans == '':
                    answer = 0
                else:
                    answer = 1

                if answer == 0:
                    flash("Answer not in the database... Let's search the Wikipedia database", 'Answer')
                    wikiflag = 1
                else:
                    flash(display_ans, 'Answer')


                """for a in abc:
                    if (data.find(a.question.lower()) != -1 or a.question.lower().find(data) != -1) and len(data) >= 4:
                        ans = 1
                        break
                if ans == 0:
                    answer = 0
                else:
                    answer = 1

                if answer == 0:
                    flash('Answer not in database... Lets search Wikipedia Database', 'Answer')
                    wikiflag = 1
                    #return redirect ('http://www.lmgtfy.com/?q=' + data)
                else:
                    finalans=a.answer
                    flash(a.answer, 'Answer')"""

        #Positive Section
        if asser != 1 and nege != 1:
            if flag == 0:
                data = form.openid.data.lower()
                flash('Positive', 'Answer')
                flag = 1
                display_ans = ''
                max_value = int(max_check * 0.8 + 0.5)  # minimum keyword hits required: 80% of max_check, rounded
                abc = models.Positive.query.all()
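                # Same keyword scoring, this time against the stored Positive questions.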
                for a in abc:
                    # Noun
                    tokenized = nltk.word_tokenize(a.question)
                    p = nltk.pos_tag(tokenized)
                    name = nltk.ne_chunk(p, binary=True)
                    ent = re.findall(r'NE\s(.*?)/', str(name))
                    chunkGram = r"""Noun: {<NN\w?>} """
                    chunkParser = nltk.RegexpParser(chunkGram)
                    NNnoun = chunkParser.parse(p)
                    db_noun = re.findall(r'Noun\s(.*?)/', str(NNnoun))

                    # Verbs
                    tokenized = nltk.word_tokenize(a.question)
                    p = nltk.pos_tag(tokenized)
                    name = nltk.ne_chunk(p, binary=True)
                    chunkGram = r"""Verb: {<VB\w?>} """
                    chunkParser = nltk.RegexpParser(chunkGram)
                    VBverb = chunkParser.parse(p)
                    db_verb = re.findall(r'Verb\s(.*?)/', str(VBverb))

                    # Adjective
                    tokenized = nltk.word_tokenize(a.question)
                    p = nltk.pos_tag(tokenized)
                    name = nltk.ne_chunk(p, binary=True)
                    chunkGram = r"""Verb: {<JJ\w?>} """
                    chunkParser = nltk.RegexpParser(chunkGram)
                    JJAdj = chunkParser.parse(p)
                    db_adj = re.findall(r'Verb\s(.*?)/', str(JJAdj))

                    # Number
                    tokenized = nltk.word_tokenize(a.question)
                    p = nltk.pos_tag(tokenized)
                    name = nltk.ne_chunk(p, binary=True)
                    chunkGram = r"""Number: {<CD\w?>} """
                    chunkParser = nltk.RegexpParser(chunkGram)
                    CDNumber = chunkParser.parse(p)
                    db_number = re.findall(r'Number\s(.*?)/', str(CDNumber))

                    db_total = db_noun + db_adj + db_verb + db_number
                    db_total = list(set(db_total))

                    count = 0
                    for ip in ip_total:
                        for dbs in db_total:
                            db_plural = re.escape(dbs) + 's?'
                            ip_plural = re.escape(ip) + 's?'
                            if re.match(db_plural, ip, flags=re.IGNORECASE):
                                count = count + 1
                            if re.match(ip_plural, dbs, flags=re.IGNORECASE):
                                count = count + 1
                            if ip == dbs:
                                count = count - 1

                    if max_value < count:
                        display_ans = a.answer
                        max_value = count

                if display_ans == '':
                    answer = 0
                else:
                    answer = 1

                if answer == 0:
                    flash("Answer not in the database... Let's search the Wikipedia database", 'Answer')
                    wikiflag = 1
                else:
                    flash(display_ans, 'Answer')

                """abc = models.Positive.query.all()
                for a in abc:
                    if (data.find(a.question.lower()) != -1 or a.question.lower().find(data) != -1) and len(data) >= 4:
                        ans = 1
                        break
                if ans == 0:
                    answer = 0
                else:
                    answer = 1

                if answer == 0:
                    flash('Answer not in database... Lets search Wikipedia Database', 'Answer')
                    wikiflag = 1
                    #return redirect ('http://www.lmgtfy.com/?q=' + data)
                else:
                    finalans=a.answer
                    flash(a.answer, 'Answer')"""

        #Wiki Section
        ans = 0
        if wikiflag == 1:

            display_ans = ''
            max_value = int(max_check * 0.8 + 0.5)  # minimum keyword hits required: 80% of max_check, rounded
            abc = models.Wikipedia.query.all()
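            # Score against Wikipedia answers already stored in the local database
            # before falling back to a live Wikipedia search below.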
            for a in abc:
                # Noun
                tokenized = nltk.word_tokenize(a.question)
                p = nltk.pos_tag(tokenized)
                name = nltk.ne_chunk(p, binary=True)
                ent = re.findall(r'NE\s(.*?)/', str(name))
                chunkGram = r"""Noun: {<NN\w?>} """
                chunkParser = nltk.RegexpParser(chunkGram)
                NNnoun = chunkParser.parse(p)
                db_noun = re.findall(r'Noun\s(.*?)/', str(NNnoun))

                # Verbs
                tokenized = nltk.word_tokenize(a.question)
                p = nltk.pos_tag(tokenized)
                name = nltk.ne_chunk(p, binary=True)
                chunkGram = r"""Verb: {<VB\w?>} """
                chunkParser = nltk.RegexpParser(chunkGram)
                VBverb = chunkParser.parse(p)
                db_verb = re.findall(r'Verb\s(.*?)/', str(VBverb))

                # Adjective
                tokenized = nltk.word_tokenize(a.question)
                p = nltk.pos_tag(tokenized)
                name = nltk.ne_chunk(p, binary=True)
                chunkGram = r"""Verb: {<JJ\w?>} """
                chunkParser = nltk.RegexpParser(chunkGram)
                JJAdj = chunkParser.parse(p)
                db_adj = re.findall(r'Verb\s(.*?)/', str(JJAdj))

                # Number
                tokenized = nltk.word_tokenize(a.question)
                p = nltk.pos_tag(tokenized)
                name = nltk.ne_chunk(p, binary=True)
                chunkGram = r"""Number: {<CD\w?>} """
                chunkParser = nltk.RegexpParser(chunkGram)
                CDNumber = chunkParser.parse(p)
                db_number = re.findall(r'Number\s(.*?)/', str(CDNumber))

                db_total = db_noun + db_adj + db_verb + db_number
                db_total = list(set(db_total))

                count = 0
                for ip in ip_total:
                    for dbs in db_total:
                        db_plural = re.escape(dbs) + 's?'
                        ip_plural = re.escape(ip) + 's?'
                        if re.match(db_plural, ip, flags=re.IGNORECASE):
                            count = count + 1
                        if re.match(ip_plural, dbs, flags=re.IGNORECASE):
                            count = count + 1
                        if ip == dbs:
                            count = count - 1

                if max_value < count:
                    display_ans = a.answer
                    max_value = count

            if display_ans == '':
                answer = 0
            else:
                answer = 1

            """abc = models.Wikipedia.query.all()
            for a in abc:
                if (data.find(a.question.lower()) != -1 or a.question.lower().find(data) != -1) and len(data) >= 4:
                    ans = 1
                    break
            if ans == 0:
                answer = 0
            else:
                answer = 1"""

            if answer == 0:
                flash("Answer not in the Wikipedia database... Let's search Wikipedia on the Internet", 'Answer')
                ny = wikipedia.search(data)
                if not ny:
                    return redirect ('http://www.lmgtfy.com/?q=' + data1)
                else:
                    try:
                        ny1 = wikipedia.summary(data1, chars=0, auto_suggest=True, redirect=True, sentences=3)
                        max_value = int(max_check * 0.8 + 0.5)
                        ip_wiki = ny1.encode('ascii','ignore')
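                        # Score the fetched Wikipedia summary against the user's keywords,
                        # the same way the database answers were scored, before accepting it.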
                        # Noun
                        tokenized = nltk.word_tokenize(ip_wiki)
                        p = nltk.pos_tag(tokenized)
                        name = nltk.ne_chunk(p, binary=True)
                        ent = re.findall(r'NE\s(.*?)/', str(name))
                        chunkGram = r"""Noun: {<NN\w?>} """
                        chunkParser = nltk.RegexpParser(chunkGram)
                        NNnoun = chunkParser.parse(p)
                        db_noun = re.findall(r'Noun\s(.*?)/', str(NNnoun))

                        # Verbs
                        tokenized = nltk.word_tokenize(ip_wiki)
                        p = nltk.pos_tag(tokenized)
                        name = nltk.ne_chunk(p, binary=True)
                        chunkGram = r"""Verb: {<VB\w?>} """
                        chunkParser = nltk.RegexpParser(chunkGram)
                        VBverb = chunkParser.parse(p)
                        db_verb = re.findall(r'Verb\s(.*?)/', str(VBverb))

                        # Adjective
                        tokenized = nltk.word_tokenize(ip_wiki)
                        p = nltk.pos_tag(tokenized)
                        name = nltk.ne_chunk(p, binary=True)
                        chunkGram = r"""Verb: {<JJ\w?>} """
                        chunkParser = nltk.RegexpParser(chunkGram)
                        JJAdj = chunkParser.parse(p)
                        db_adj = re.findall(r'Verb\s(.*?)/', str(JJAdj))

                        # Number
                        tokenized = nltk.word_tokenize(ip_wiki)
                        p = nltk.pos_tag(tokenized)
                        name = nltk.ne_chunk(p, binary=True)
                        chunkGram = r"""Number: {<CD\w?>} """
                        chunkParser = nltk.RegexpParser(chunkGram)
                        CDNumber = chunkParser.parse(p)
                        db_number = re.findall(r'Number\s(.*?)/', str(CDNumber))

                        db_total = db_noun + db_adj + db_verb + db_number
                        db_total = list(set(db_total))

                        count = 0
                        for ip in ip_total:
                            for dbs in db_total:
                                db_plural = re.escape(dbs) + 's?'
                                ip_plural = re.escape(ip) + 's?'
                                if re.match(db_plural, ip, flags=re.IGNORECASE):
                                    count = count + 1
                                if re.match(ip_plural, dbs, flags=re.IGNORECASE):
                                    count = count + 1
                                if ip == dbs:
                                    count = count - 1

                        if max_value <= count:
                            display_ans = ny1

                        if display_ans == '':
                            answer = 0
                        else:
                            answer = 1

                        if answer == 0:
                            flash('No precise answer found on Wikipedia; showing the closest summary', 'Answer')
                            flash(ny1, 'Answer')
                            wikiflag = 1
                        else:
                            display_ans=ny1
                            flash(ny1, 'Answer')
                            ny2 = wikipedia.page(data1)
                            flash('Source: '+ ny2.url, 'Answer')
                            #u = models.Wikipedia(question=data1, answer=ny1)
                            #db.session.add(u)
                            #db.session.commit()
                    except Exception as inst:
                        flash('Your question is either out of scope or too trivial for me to answer: ' + str(inst), 'Answer')
                        display_ans = 'Your question is either out of scope or too trivial for me to answer'
            else:
                flash(display_ans, 'Answer')
        #s = models.Chats.query.all()
        #for chat in reversed(s):
            #flash('Question: ' + chat.question, 'Display')
            #flash('Answer: ' + chat.answer , 'Display')
            #flash('.', 'Display')
        #u = models.Chats(question=data1, answer=display_ans)
        #db.session.add(u)
        #db.session.commit() 
        return redirect('/test')
    return render_template("index2.html",
        title = 'ChatterBot',
        form = form)
Ejemplo n.º 47
0
 def setUp(self):
   # shortest wikipedia articles with images and sections
   self.celtuce = wikipedia.page("Celtuce")
   self.cyclone = wikipedia.page("Tropical Depression Ten (2005)")
   self.great_wall_of_china = wikipedia.page("Great Wall of China")
Ejemplo n.º 48
0
 def test_redirect_false(self):
     """Test that page raises an error on a redirect when redirect == False."""
     mp = lambda: wikipedia.page(
         "Menlo Park, New Jersey", auto_suggest=False, redirect=False)
     self.assertRaises(wikipedia.RedirectError, mp)
Ejemplo n.º 49
0
 def test_redirect_with_normalization(self):
     """Test that a page redirect with a normalized query loads correctly"""
     the_party = wikipedia.page("communist Party", auto_suggest=False)
     self.assertIsInstance(the_party, wikipedia.WikipediaPage)
     self.assertEqual(the_party.title, "Communist party")
Ejemplo n.º 50
0
	def test_something_else(self):
		print wikipedia.page('Menlo Park, New Jersey', auto_suggest=False)
Ejemplo n.º 51
0
 def test_from_page_id(self):
     """Test loading from a page id"""
     self.assertEqual(self.celtuce, wikipedia.page(pageid=1868108))
Ejemplo n.º 52
0
        if isinstance(claims, dict):
            for super_type in claims.get('P31', []):
                super_type_value = super_type['mainsnak']['datavalue']['value']['numeric-id']
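                # Wikidata classes treated as book-like works, e.g. Q571 ("book") and
                # Q7725634 ("literary work").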
                if super_type_value in (571,191067,35760,7725634):
                    return True
                else:
                    item_value = u'Q{}'.format(super_type_value)
                    item_label = dictionary_lookup(item_value)
        else:
            print u'non-dict claims in {}'.format(wd_id)
    except ValueError:
        return False

pgmatch = re.compile(r"gutenberg.org/(etext|ebooks|files)/(\d+)")
# This file will contain candidate pages; some of them will not be exactly what we want.
fname='./metadata/pg-wikipedia-candidates.txt'
with open(fname,'w') as f:
    for result in embeds('Gutenberg', results=10000):
        wd_id = get_wikidata_id(result['title'])
        if is_book(wd_id):
            pg_ids = []
            page = wikipedia.page(pageid=result['pageid'])
            # scan the article's rendered HTML for gutenberg.org links and record each distinct text id
            for match in pgmatch.findall(page.html()):
                if match[1] not in pg_ids:
                    pg_ids.append(match[1])
                    line = u'{}\t{}\t{}'.format(result['title'], wd_id, int(match[1]))
                    print line
                    f.write(line.encode('UTF-8'))
                    f.write('\r')

Ejemplo n.º 53
0
 def test_missing(self):
     """Test that page raises a PageError for a nonexistant page."""
     # Callicarpa?
     purpleberry = lambda: wikipedia.page("purpleberry", auto_suggest=False)
     self.assertRaises(wikipedia.PageError, purpleberry)
Ejemplo n.º 54
0
def geo(lat: float, lon: float) -> str:
    wikipedia.set_lang('en')
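    # geosearch returns the titles of Wikipedia pages near the given coordinates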
    return json.dumps(
        filterResult(
            wikipedia.page(
                wikipedia.geosearch(lat, lon))))
Ejemplo n.º 55
0
 def setUp(self):
     # one of the shortest wikipedia articles that includes images
     self.celtuce = wikipedia.page("Celtuce")
Ejemplo n.º 56
0
	def test_something(self):
		print wikipedia.page('Carl D. Anderson', auto_suggest=False)