Code Example #1
File: miner.py  Project: lidsky/alienknows
def get_preview_website(url, target_test, title, response):

    #TODO: REFACTORRR!!!!!!
    #TODO: check content-type, if video or article, do not run summarizer

    if get_domain(url) in SKIP_DOMAIN_SUMMARY:
        return ''
    if target_test:
        sentence_array = []
        try:
            sentence_array = pyteaser.SummarizeUrl(url)
        except ZeroDivisionError as e:
            print e
            print 'in get_preview_website, problem with pyteaser.SummarizeUrl, submission.url: ', url
        except Exception:
            print 'in get_preview_website, problem with pyteaser.SummarizeUrl, submission.url: ', url
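        # Fall back to summarizing the already-fetched HTML body when the URL summary comes back empty.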
        if not sentence_array and title and response and response.ok and 'text/html' in response.headers['content-type']:
            body_text = extract_article(response.text)
            try:
                sentence_array = pyteaser.Summarize(title, body_text)
            except ZeroDivisionError:
                print 'in get_preview_website, problem with pyteaser.Summarize, submission.url: ', url
        if sentence_array:
            valid_summary_debugger(sentence_array, target_test)
            if valid_summary(sentence_array, target_test):
                return join_sentence_array(sentence_array)                   
    return ''
Code Example #2
def grab(location, keywords, publication, publication_date, title):
    goose = Goose()
    try:
        raw_article = goose.extract(url=location)
        description = raw_article.meta_description.encode("utf8")
        article = raw_article.cleaned_text.encode("utf8")
        split_keywords = keywords.split(',')

        summary = pyteaser.SummarizeUrl(location)
        output = json.dumps({
            "title": title,
            "keywords": split_keywords,
            "publication": publication,
            "publication_date": publication_date,
            "description": description,
            "source": location,
            "article": article,
            "summary": summary
        })
        logging.warning('Successfully grabbed through Goose.')
        logging.warning('Location: %s, Publication: %s' %
                        (location, publication))
        return output
    except Exception:
        logging.critical('Unable to get article through Goose.')
        logging.critical('Location: %s, Publication: %s' %
                         (location, publication))
        return None
Code Example #3
def main():
    submissions = getSubmissions()
    done = getDone()
    counts = 0  #how many comments made this round

    for submission in submissions:
        if counts >= comments_per_run:
            break
        id = submission.id
        point = submission.ups - submission.downs

        if id not in done and point < thresh_max and point > thresh_min:
            putDone(submission.id)
            sentences = pyteaser.SummarizeUrl(submission.url)
            # Only comment when pyteaser actually produced a summary.
            if sentences is not None:
                counts += 1
                comment = formComment(sentences, submission)
                submission.add_comment(comment)
                print(comment)
Code Example #4
File: library.py  Project: cybertrust1/textflows
def streaming_summarize_url(input_dict, widget, stream=None):
    import pyteaser
    summaries = pyteaser.SummarizeUrl(input_dict['url'])
    output_dict = {}
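    # summaries is the list of sentences returned by pyteaser; join them into one string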
    output_dict['summary'] = " ".join(summaries)
    return output_dict
Code Example #5
def __fetchUrl(self, url):
    # run() should catch the exceptions and handle them
    self.logger.debug('%s fetching page: %s' % (self.name, url))
    res = pyteaser.SummarizeUrl(url)  # TODO: use GENERIC_HEADERS and TIMEOUT
    return " ".join(res)
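All five examples call pyteaser.SummarizeUrl(url), which returns a list of summary sentences, or None when the page cannot be fetched or parsed (which is why Examples #1 and #3 check the result before using it). A minimal defensive wrapper might look like the sketch below; it is not taken from any of the projects above, and the URL is a placeholder.

import pyteaser

def summarize_or_empty(url):
    # SummarizeUrl returns a list of sentences, or None on failure.
    try:
        sentences = pyteaser.SummarizeUrl(url)
    except Exception:
        sentences = None
    return " ".join(sentences) if sentences else ""

print(summarize_or_empty('http://example.com/some-article'))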