Example #1
0
    def add_doc(self, doc):
        """Build an UnprocessedDocument from a dict and add it to the index.

        ``doc['text']`` is stored unchanged as the ``text`` field and
        ``str(doc['id'])`` becomes the document id. Every other key is
        stored as a field whose value is passed through ``ensure_unicode``;
        a list value contributes one field per item.

        An ``errors.IndexerError`` raised by ``self.dbconn.add`` is printed
        and swallowed; any other exception propagates to the caller.
        """
        content = doc['text']
        document = UnprocessedDocument()
        document.fields.append(Field('text', content))

        for k, v in doc.items():
            if k in ('text', 'id'):
                continue
            # Treat a scalar as a one-item list so both cases share one path.
            values = v if isinstance(v, list) else [v]
            for item in values:
                document.fields.append(Field(k, ensure_unicode(item)))
        document.id = str(doc['id'])

        # Acquire before entering the try block: if acquire() itself fails,
        # the finally clause must not release a lock that was never held.
        self.lock.acquire()
        try:
            self.dbconn.add(document)
        except errors.IndexerError as e:
            print(str(e))
        finally:
            self.lock.release()
Example #2
0
    def add_doc(self, doc):
        """Build an UnprocessedDocument from a dict and add it to the index.

        ``doc['text']`` is stored unchanged as the ``text`` field and
        ``str(doc['id'])`` becomes the document id. Every other key is
        stored as a field whose value is passed through ``ensure_unicode``;
        a list value contributes one field per item.

        An ``errors.IndexerError`` raised by ``self.dbconn.add`` is printed
        and swallowed; any other exception propagates to the caller.
        """
        content = doc['text']
        document = UnprocessedDocument()
        document.fields.append(Field('text', content))

        for k, v in doc.items():
            if k in ('text', 'id'):
                continue
            # Treat a scalar as a one-item list so both cases share one path.
            values = v if isinstance(v, list) else [v]
            for item in values:
                document.fields.append(Field(k, ensure_unicode(item)))
        document.id = str(doc['id'])

        # Acquire before entering the try block: if acquire() itself fails,
        # the finally clause must not release a lock that was never held.
        self.lock.acquire()
        try:
            self.dbconn.add(document)
        except errors.IndexerError as e:
            print(str(e))
        finally:
            self.lock.release()
Example #3
0
def search(ctx):
    """Fetch GOOGLE_SCHOLAR_URL for ``ctx.query`` and parse the result page.

    Returns a dict with two keys:

    * ``'results'``: list of ``SearchResult`` objects. Each accepted entry
      contributes one with type ``None`` for its title link and one with
      type ``'directpdf'`` for its ``gs_ggs`` link, when those links exist.
    * ``'ctx_update'``: dict that may hold ``'title'`` (set once, from the
      first accepted entry whose ``title_correct`` result has a truthy
      second item) and ``'citecnt'`` (overwritten by every accepted entry
      that yields a count, so the last one wins).

    An entry is accepted when the first item returned by ``title_correct``
    is truthy. Errors while parsing a single entry are reported through
    ``log_exc`` and do not abort the search.
    """
    query = ctx.query.lower()

    ret = {'ctx_update': {}}
    srs = []

    r = requests.get(GOOGLE_SCHOLAR_URL.format(query))
    text = r.text.encode('utf-8')

    def find_citecnt(dom):
        """Return the first integer found in the text of the first link
        inside the first 'gs_fl' element under the first 'gs_ri' element
        of *dom*, or None when any of these is missing."""
        try:
            find = dom.findAll(attrs={'class': 'gs_ri'})[0]
            find = find.findAll(attrs={'class': 'gs_fl'})[0]
            find = find.findAll('a')[0].text
            return int(re.search('[0-9]+', find).group())
        except Exception:
            # Missing markup or no digits in the link text: no count.
            return None

    soup = BeautifulSoup(text, BS_PARSER)
    results = soup.findAll(attrs={'class': 'gs_r'})
    title_updated = None
    for rst in results:
        try:
            h3 = rst.findAll('h3')[0]
            real_title = h3.get_text()
            real_title = filter_title_fileformat(real_title)
            tc = title_correct(query, real_title)
            if not tc[0]:
                continue
            if not title_updated and tc[1]:
                title_updated = ensure_unicode(title_beautify(real_title))
                # Strip leading bracketed tags such as '[citation][c] Title'
                # one at a time until the title stops changing.
                while True:
                    new_title = re.sub(r'^\[[^\]]*\]', '', title_updated).strip()
                    if new_title == title_updated:
                        break
                    title_updated = new_title
                log_info(u"Title updated: {0}".format(title_updated))
                ret['ctx_update']['title'] = title_updated

            cnt = find_citecnt(rst)
            if cnt is not None:
                ret['ctx_update']['citecnt'] = cnt

            try:
                href = h3.find('a').get('href')
                # Without this guard a missing href becomes the URL 'None'.
                if href:
                    srs.append(SearchResult(None, str(href)))
            except Exception:
                # Best effort: an entry without a usable title link is
                # still checked for a direct PDF link below.
                pass

            findpdf = rst.findAll(attrs={'class': 'gs_ggs'})
            if findpdf:
                pdflink = findpdf[0].find('a').get('href')
                # Same guard as above: skip anchors that carry no href.
                if pdflink:
                    srs.append(SearchResult('directpdf', str(pdflink)))
        except Exception as e:
            log_exc("Search Item parse error: {0}".format(str(e)))
    ret['results'] = srs
    return ret
Example #4
0
 def update_new_title(self, title):
     """Replace ``self.title`` with *title* when the two differ.

     Returns True if the title was replaced (the change is reported
     through ``log_info``), False if *title* already matched.
     """
     differs = title != self.title
     if not differs:
         return False
     message = "Using new title: {0}".format(ensure_unicode(title))
     log_info(message)
     self.title = title
     return True
Example #5
0
def name_clean(name):
    """Remove every parenthesised span from *name* and strip the result.

    Matching is non-greedy, so each "(...)" pair is removed on its own,
    and DOTALL lets a span run across line breaks. The cleaned text is
    passed through ``ensure_unicode`` before it is returned.
    """
    # Raw string: '\(' in a plain literal is an invalid escape sequence.
    p = re.compile(r'\(.*?\)', re.DOTALL)
    ret = p.sub('', name).strip()
    return ensure_unicode(ret)
Example #6
0
def name_clean(name):
    """Remove every parenthesised span from *name* and strip the result.

    Matching is non-greedy, so each "(...)" pair is removed on its own,
    and DOTALL lets a span run across line breaks. The cleaned text is
    passed through ``ensure_unicode`` before it is returned.
    """
    # Raw string: '\(' in a plain literal is an invalid escape sequence.
    p = re.compile(r'\(.*?\)', re.DOTALL)
    ret = p.sub('', name).strip()
    return ensure_unicode(ret)