Esempio n. 1
0
def get_description(soup, feed):
    """Return the first available description for a page.

    The ``description`` value is looked up, in order, through
    ``get_og_property``, ``get_twitter_property`` and ``get_meta_property``;
    the first truthy result is returned. When all three come back empty,
    ``feed.description`` passed through ``clean_html`` is returned instead.
    """
    # ``or`` short-circuits, so a later lookup only runs when every earlier
    # one produced nothing.
    return (
        get_og_property(soup, 'description')
        or get_twitter_property(soup, 'description')
        or get_meta_property(soup, 'description')
        or clean_html(feed.description)
    )
Esempio n. 2
0
def get_title(soup, feed):
    """Return the first available title for a page.

    The ``title`` value is looked up, in order, through ``get_og_property``,
    ``get_twitter_property`` and ``get_meta_property``; the first truthy
    result is returned. When all three come back empty, ``feed.title``
    passed through ``clean_html`` is returned instead.
    """
    # ``or`` short-circuits, so a later lookup only runs when every earlier
    # one produced nothing.
    return (
        get_og_property(soup, 'title')
        or get_twitter_property(soup, 'title')
        or get_meta_property(soup, 'title')
        or clean_html(feed.title)
    )
Esempio n. 3
0
    def run_get_abstracts(self):
        """Stamp ``self.updated`` and scrape an abstract if none is stored.

        ``self.updated`` is always set to the current UTC time. When
        ``self.abstract`` is falsy, the page at ``http://doi.org/<self.id>``
        is fetched; if its text contains ``</header>``, everything after the
        first ``</header`` and then after the first tab-indented ``<p>``
        marker is passed through ``clean_html`` and its first 1000
        characters are stored in ``self.abstract``.

        A page without the expected markers leaves ``self.abstract``
        untouched. Errors raised by ``requests`` (including the timeout
        below) are not caught here and propagate to the caller.
        """
        self.updated = datetime.datetime.utcnow()

        if self.abstract:
            return

        # requests applies no timeout by default, so without one a stalled
        # resolver would block this call indefinitely.
        r = requests.get("http://doi.org/{}".format(self.id), timeout=30)
        text = r.text
        if "</header>" not in text:
            return

        try:
            text_after_header = text.split("</header", 1)[1]
            # The marker is the exact run of tabs that precedes the abstract
            # paragraph on the scraped page.
            text_after_p = text_after_header.split(
                "																						<p>", 1)[1]
            clean_text = clean_html(text_after_p)
            self.abstract = clean_text[0:1000]
        except IndexError:
            # Marker not present on this page: best effort, keep no abstract.
            pass
Esempio n. 4
0
    def article(self):
        """Return a dictionary with the title and paragraphs of the article.

        ``self.raw_text`` is parsed with BeautifulSoup (using
        ``self.html_parser``) into ``self.soup`` and ``self._article()`` is
        run; the result is then built from ``self._title`` and
        ``self._paragraphs``.

        Returns:
            dict: ``'title'`` holds the first title passed through
            ``clean_html``; ``'paragraphs'`` holds a list of the paragraphs
            passed through ``clean_html``, with empty results dropped.

        Raises:
            ArticleNotParsable: if ``self._title`` or ``self._paragraphs``
                is empty after ``self._article()`` has run.
        """
        self.soup = BeautifulSoup(self.raw_text, self.html_parser)

        self._article()

        if len(self._title) == 0 or len(self._paragraphs) == 0:
            raise ArticleNotParsable()

        # Build a real list: on Python 3 filter()/map() return lazy one-shot
        # iterators, which callers could neither len(), index, serialise nor
        # iterate twice.
        cleaned = (clean_html(paragraph) for paragraph in self._paragraphs)

        return {
            'title': clean_html(self._title[0]),
            # clean html and remove blank paragraphs
            'paragraphs': [paragraph for paragraph in cleaned if paragraph],
        }
Esempio n. 5
0
 def clean_description(self):
     """Return ``cleaned_data['description']`` passed through ``util.clean_html``."""
     raw_description = self.cleaned_data['description']
     return util.clean_html(raw_description)
Esempio n. 6
0
def getEntitiesAndEnrichSourcesSequential(sources, paramsDict):
    """Enrich each entry of ``sources`` with page text and entities, one by one.

    Options missing from ``paramsDict`` are written into it in place with
    these defaults: ``addTitleClass=False``, ``addTopKTermsFlag=0``,
    ``derefSleep=0``, ``debugFlag=False``, ``cacheFlag=False``.

    For every source, ``sourceDict['link']`` is dereferenced (through
    ``derefURICache`` when both ``debugFlag`` and ``cacheFlag`` are set,
    otherwise through ``dereferenceURI``) and ``setSourceDictDetails`` is
    applied. Sources whose HTML or extracted text is empty get nothing
    further. The others receive ``text``, ``title``, ``favicon``,
    ``extraction-time`` and ``entities``.

    Returns:
        The same ``sources`` mapping, updated in place.
    """
    print('\ngetEntities Sequential():')

    # Fill in every option the caller left out (paramsDict is updated in place).
    optionDefaults = (
        ('addTitleClass', False),
        ('addTopKTermsFlag', 0),
        ('derefSleep', 0),
        ('debugFlag', False),
        ('cacheFlag', False),
    )
    for optionName, fallback in optionDefaults:
        paramsDict.setdefault(optionName, fallback)

    for sourceDict in sources.values():

        readFromCache = paramsDict['debugFlag'] and paramsDict['cacheFlag']
        if readFromCache:
            html = derefURICache(sourceDict['link'])
        else:
            html = dereferenceURI(sourceDict['link'], paramsDict['derefSleep'])

        # Applied to every source, including those skipped below.
        setSourceDictDetails(sourceDict)

        if not len(html):
            continue

        title = extractPageTitleFromHTML(html)
        text = clean_html(html)
        favicon = extractFavIconFromHTML(html, sourceDict['link'])

        if not len(text):
            continue

        entities2dList = getEntitiesFromText(text)

        if paramsDict['addTitleClass']:
            titleLabels = getTokenLabelsForText(title, 'TITLE')
            entities2dList = entities2dList + titleLabels

        # Optionally label the top-K terms of the text as extra entities.
        topK = paramsDict['addTopKTermsFlag']
        if topK > 0:
            topKTerms = getTopKTermsListFromText(text, topK)

            # Every term is followed by a single space, the last one included.
            allTerms = ''.join(
                termCountTup[0] + ' '
                for termCountTup in topKTerms
                if len(termCountTup) != 0
            )

            termLabels = getTokenLabelsForText(
                allTerms, 'TOP' + str(topK) + 'TERM')
            entities2dList = entities2dList + termLabels

        sourceDict['text'] = sanitizeText(text)
        sourceDict['title'] = title
        sourceDict['favicon'] = favicon
        sourceDict['extraction-time'] = datetime.now().isoformat()
        sourceDict['entities'] = addDetailsToEntities(entities2dList)

    return sources
Esempio n. 7
0
def getEntitiesAndEnrichSources(sources, paramsDict):
    """Enrich ``sources`` with page text, then label entities in a worker pool.

    Options missing from ``paramsDict`` are written into it in place with
    these defaults: ``addTitleClass=False``, ``addTopKTermsFlag=0``,
    ``derefSleep=0``, ``threadPoolCount=5``, ``debugFlag=False``,
    ``cacheFlag=False``.

    Every source's ``link`` is dereferenced (through ``derefURICache`` when
    both ``debugFlag`` and ``cacheFlag`` are set, otherwise through
    ``dereferenceURI``). Sources with non-empty HTML and text receive
    ``title``, ``text`` and ``favicon`` and are queued for labelling. The
    queue is mapped over a ``Pool`` of ``threadPoolCount`` workers, using
    ``parallelNERNew`` when ``nlpIsServerOn`` reports the server is up and
    ``parallelNER`` otherwise. Each labelled source then receives
    ``entities`` and ``extraction-time``.

    Returns:
        ``(sources, nerVersion)`` on the normal path, where ``nerVersion``
        is ``'3.8.0'`` or ``'old'``.

    NOTE(review): the return shape is not uniform. When ``threadPoolCount``
    is 0 the result of ``getEntitiesAndEnrichSourcesSequential`` is returned
    as is, and when the pool stage raises, ``sources`` alone is returned.
    Left unchanged so existing callers keep working - confirm what callers
    expect before unifying.
    """
    # NOTE: getEntitiesAndEnrichSourcesSequential duplicates this
    # functionality for simplicity; keep the two in step.
    print('\ngetEntities()')

    #check/set defaults - start
    paramsDict.setdefault('addTitleClass', False)
    paramsDict.setdefault('addTopKTermsFlag', 0)
    paramsDict.setdefault('derefSleep', 0)
    paramsDict.setdefault('threadPoolCount', 5)
    paramsDict.setdefault('debugFlag', False)
    paramsDict.setdefault('cacheFlag', False)
    #check/set defaults - end

    if (paramsDict['threadPoolCount'] == 0):
        return getEntitiesAndEnrichSourcesSequential(sources, paramsDict)

    print('\tthreadPoolCount:', paramsDict['threadPoolCount'])

    textColToLabel = []
    listOfEntities2dList = []

    total = len(sources)
    nerVersion = ''
    for count, (source, sourceDict) in enumerate(sources.items(), 1):

        if (paramsDict['debugFlag'] and paramsDict['cacheFlag']):
            html = derefURICache(sourceDict['link'])
        else:
            html = dereferenceURI(sourceDict['link'], paramsDict['derefSleep'])

        #set defaults - start
        setSourceDictDetails(sourceDict)
        #set defaults - end

        print('\tsource:', source)
        print('\t', count, 'of', total)

        if (html == ''):
            continue

        title = extractPageTitleFromHTML(html)
        text = clean_html(html)
        text = sanitizeText(text)
        favicon = extractFavIconFromHTML(html, sourceDict['link'])

        print('\thtml.len:', len(html))
        print('\ttext.len:', len(text))
        print()
        if (text == ''):
            continue

        sourceDict['title'] = title
        sourceDict['text'] = text
        sourceDict['favicon'] = favicon

        textColToLabel.append({
            'textToLabel': text,
            'id': source,
            'published': sourceDict['published']
        })

    try:
        workers = Pool(paramsDict['threadPoolCount'])
        try:
            # NOTE(review): relies on a module-level `args` that is not
            # visible in this block - confirm it is set when this function
            # is used outside the command-line entry point.
            serverOn = nlpIsServerOn(args.nlp_server_host)

            if (serverOn):
                print('\tNER version: 3.8.0')
                listOfEntities2dList = workers.map(
                    parallelNERNew, textColToLabel)
                nerVersion = '3.8.0'
            else:
                print('\tNER version: old')
                #use old ner version since new server was not able to be started
                listOfEntities2dList = workers.map(parallelNER, textColToLabel)
                nerVersion = 'old'
        finally:
            # Always release the pool, including when the probe or map()
            # raised; previously a failure left the workers running.
            workers.close()
            workers.join()
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are no longer swallowed here.
        localErrorHandler()
        return sources

    for entitiesDetailsDict in listOfEntities2dList:

        source = entitiesDetailsDict['id']
        sources[source]['entities'] = entitiesDetailsDict['entities2dList']

        if (paramsDict['addTitleClass']):
            sources[source]['entities'] += getTokenLabelsForText(
                sources[source]['title'], 'TITLE')

        #add top addTopKTermsFlag terms - start
        if (paramsDict['addTopKTermsFlag'] > 0):
            topKTerms = getTopKTermsListFromText(
                sources[source]['text'], paramsDict['addTopKTermsFlag'])

            # Every term is followed by a single space, the last one included.
            allTerms = ''.join(
                termCountTup[0] + ' '
                for termCountTup in topKTerms
                if len(termCountTup) != 0
            )

            sources[source]['entities'] += getTokenLabelsForText(
                allTerms, 'TOP' + str(paramsDict['addTopKTermsFlag']) + 'TERM')
        #add top addTopKTermsFlag terms - end

        # Stamp the extraction time and attach details to the entities.
        sources[source]['extraction-time'] = datetime.now().isoformat()
        sources[source]['entities'] = addDetailsToEntities(
            sources[source]['entities'])

    return sources, nerVersion
Esempio n. 8
0
 def clean_content(self):
     """Return ``cleaned_data["content"]`` passed through ``util.clean_html``."""
     raw_content = self.cleaned_data["content"]
     return util.clean_html(raw_content)