Python title_correct Exemples, lib.textutil.title_correct Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : google.py Projet : Mukosame/SoPaper

def search(ctx):
    """Query Google for ``ctx.query`` and collect links from matching results.

    The lower-cased query is substituted into ``GOOGLE_URL``, the response is
    parsed with BeautifulSoup, and every element with class ``g`` is examined.
    A result is kept only when it has an ``<h3>`` title and
    ``title_correct(query, title)[0]`` is truthy.

    :param ctx: object exposing a ``query`` string attribute.
    :returns: dict with two keys:
        ``'ctx_update'`` -- dict that receives ``'citecnt'`` when a citation
        count could be parsed (a later matching result overwrites an earlier
        one);
        ``'results'`` -- list of ``SearchResult`` objects, typed
        ``'directpdf'`` when the result's first ``<span>`` reads ``[PDF]``
        and ``None`` otherwise.
    """
    query = ctx.query.lower()

    ret = {}
    ret['ctx_update'] = {}
    srs = []

    # NOTE(review): 'Hostname' is not a standard HTTP request header (the
    # standard one is 'Host'); left unchanged -- confirm it is intentional.
    headers = {
        'Hostname': 'www.google.com',
        'User-Agent': ukconfig.USER_AGENT,
        'Accept-Encoding': 'gzip'
    }
    # NOTE(review): the query is inserted into the URL without explicit
    # quoting and the request has no timeout -- confirm both are acceptable.
    r = requests.get(GOOGLE_URL.format(query), headers=headers, verify=True)
    text = r.text.encode('utf-8')

    def find_citecnt(dom):
        """Return the first integer found in the first link of the first
        ``f slp`` element of ``dom``, or ``None`` if anything is missing."""
        try:
            find = dom.findAll(attrs={'class': 'f slp'})[0]
            find = find.findAll('a')[0].text
            citecnt = re.search('[0-9]+', find).group()
            return int(citecnt)
        except Exception:
            # Best-effort: a missing element or a link without digits simply
            # means no citation count is available for this result.
            return None

    soup = BeautifulSoup(text, BS_PARSER)
    results = soup.findAll(attrs={'class': 'g'})
    for rst in results:
        try:
            h3 = rst.findAll('h3')
            if not h3:  # frame search, e.g. picture/video/kg
                continue
            real_title = h3[0].get_text()
            tc = title_correct(query, real_title)
            if not tc[0]:
                continue
            # TODO do some title update?
            cnt = find_citecnt(rst)
            if cnt is not None:
                ret['ctx_update']['citecnt'] = cnt

            spans = rst.findAll('span')
            is_pdf = bool(spans) and spans[0].text == '[PDF]'
            # Both kinds of result take their target from the first anchor;
            # only the SearchResult type differs.
            link = rst.findAll('a')[0].get('href')
            try:
                url = parse_google_link(link)
            except Exception:
                # An unparsable link means this result is skipped.
                continue
            srs.append(SearchResult('directpdf' if is_pdf else None, url))
        except Exception as e:
            log_exc("Search Item parse error: {0}".format(str(e)))
    ret['results'] = srs
    return ret

Exemple #2

0

Afficher le fichier

Fichier : gscholar.py Projet : Mukosame/SoPaper

def search(ctx):
    """Query Google Scholar for ``ctx.query`` and collect links from results.

    The lower-cased query is substituted into ``GOOGLE_SCHOLAR_URL``, the
    response is parsed with BeautifulSoup, and every element with class
    ``gs_r`` is examined. A result is kept only when
    ``title_correct(query, title)[0]`` is truthy.

    :param ctx: object exposing a ``query`` string attribute.
    :returns: dict with two keys:
        ``'ctx_update'`` -- dict that may receive ``'title'`` (set while no
        non-empty title has been recorded yet and ``title_correct(...)[1]`` is
        truthy) and ``'citecnt'`` (a later matching result overwrites an
        earlier one);
        ``'results'`` -- list of ``SearchResult`` objects: one typed ``None``
        for the title link and one typed ``'directpdf'`` for the ``gs_ggs``
        link, each only when the link carries an ``href``.
    """
    query = ctx.query.lower()

    ret = {}
    ret['ctx_update'] = {}
    srs = []

    # NOTE(review): the query is inserted into the URL without explicit
    # quoting and the request has no timeout -- confirm both are acceptable.
    r = requests.get(GOOGLE_SCHOLAR_URL.format(query))
    text = r.text.encode('utf-8')

    def find_citecnt(dom):
        """Return the first integer found in the first link of the
        ``gs_ri`` / ``gs_fl`` element of ``dom``, or ``None`` if anything is
        missing."""
        try:
            find = dom.findAll(attrs={'class': 'gs_ri'})[0]
            find = find.findAll(attrs={'class': 'gs_fl'})[0]
            find = find.findAll('a')[0].text
            cnt = re.search('[0-9]+', find).group()
            return int(cnt)
        except Exception:
            # Best-effort: a missing element or a link without digits simply
            # means no citation count is available for this result.
            return None

    soup = BeautifulSoup(text, BS_PARSER)
    results = soup.findAll(attrs={'class': 'gs_r'})
    title_updated = None
    for rst in results:
        try:
            h3 = rst.findAll('h3')[0]
            real_title = h3.get_text()
            real_title = filter_title_fileformat(real_title)
            tc = title_correct(query, real_title)
            if not tc[0]:
                continue
            if not title_updated and tc[1]:
                title_updated = ensure_unicode(title_beautify(real_title))
                # Strip leading bracketed tags such as '[citation][c] Title'
                # one at a time until the title stops changing.
                while True:
                    new_title = re.sub(r'^\[[^\]]*\]', '', title_updated).strip()
                    if new_title == title_updated:
                        break
                    title_updated = new_title
                log_info(u"Title updated: {0}".format(title_updated))
                ret['ctx_update']['title'] = title_updated

            cnt = find_citecnt(rst)
            if cnt is not None:
                ret['ctx_update']['citecnt'] = cnt

            try:
                href = h3.find('a').get('href')
                # A link without an href used to be recorded as the literal
                # URL 'None'; skip it instead.
                if href is not None:
                    srs.append(SearchResult(None, str(href)))
            except Exception:
                # Best-effort: a title without a usable link must not prevent
                # the direct-PDF lookup below.
                pass

            findpdf = rst.findAll(attrs={'class': 'gs_ggs'})
            if findpdf:
                pdflink = findpdf[0].find('a').get('href')
                if pdflink is not None:
                    srs.append(SearchResult('directpdf', str(pdflink)))
        except Exception as e:
            log_exc("Search Item parse error: {0}".format(str(e)))
    ret['results'] = srs
    return ret

Exemple #3

0

Afficher le fichier

Fichier : google.py Projet : Mukosame/SoPaper

def search(ctx):
    """Query Google for ``ctx.query`` and collect links from matching results.

    The lower-cased query is substituted into ``GOOGLE_URL``, the response is
    parsed with BeautifulSoup, and every element with class ``g`` is examined.
    A result is kept only when it has an ``<h3>`` title and
    ``title_correct(query, title)[0]`` is truthy.

    :param ctx: object exposing a ``query`` string attribute.
    :returns: dict with two keys:
        ``'ctx_update'`` -- dict that receives ``'citecnt'`` when a citation
        count could be parsed (a later matching result overwrites an earlier
        one);
        ``'results'`` -- list of ``SearchResult`` objects, typed
        ``'directpdf'`` when the result's first ``<span>`` reads ``[PDF]``
        and ``None`` otherwise.
    """
    query = ctx.query.lower()

    ret = {}
    ret['ctx_update'] = {}
    srs = []

    # NOTE(review): 'Hostname' is not a standard HTTP request header (the
    # standard one is 'Host'); left unchanged -- confirm it is intentional.
    headers = {
        'Hostname': 'www.google.com',
        'User-Agent': ukconfig.USER_AGENT,
        'Accept-Encoding': 'gzip'
    }
    # NOTE(review): the query is inserted into the URL without explicit
    # quoting and the request has no timeout -- confirm both are acceptable.
    r = requests.get(GOOGLE_URL.format(query), headers=headers, verify=True)
    text = r.text.encode('utf-8')

    def find_citecnt(dom):
        """Return the first integer found in the first link of the first
        ``f slp`` element of ``dom``, or ``None`` if anything is missing."""
        try:
            find = dom.findAll(attrs={'class': 'f slp'})[0]
            find = find.findAll('a')[0].text
            citecnt = re.search('[0-9]+', find).group()
            return int(citecnt)
        except Exception:
            # Best-effort: a missing element or a link without digits simply
            # means no citation count is available for this result.
            return None

    soup = BeautifulSoup(text, BS_PARSER)
    results = soup.findAll(attrs={'class': 'g'})
    for rst in results:
        try:
            h3 = rst.findAll('h3')
            if not h3:  # frame search, e.g. picture/video/kg
                continue
            real_title = h3[0].get_text()
            tc = title_correct(query, real_title)
            if not tc[0]:
                continue
            # TODO do some title update?
            cnt = find_citecnt(rst)
            if cnt is not None:
                ret['ctx_update']['citecnt'] = cnt

            spans = rst.findAll('span')
            is_pdf = bool(spans) and spans[0].text == '[PDF]'
            # Both kinds of result take their target from the first anchor;
            # only the SearchResult type differs.
            link = rst.findAll('a')[0].get('href')
            try:
                url = parse_google_link(link)
            except Exception:
                # An unparsable link means this result is skipped.
                continue
            srs.append(SearchResult('directpdf' if is_pdf else None, url))
        except Exception as e:
            log_exc("Search Item parse error: {0}".format(str(e)))
    ret['results'] = srs
    return ret