Example 1
def refresh():
    articles = [article()]
    try:
        with open('articles.json') as f:
            articles += [article(_) for _ in json.load(f)]
    except FileNotFoundError:
        pass
    except json.decoder.JSONDecodeError:
        os.remove('articles.json')
    with open('articles.json', 'w+') as f:
        json.dump([a.__dict__ for a in articles], f)
    return redirect('/', code=303)
Example 2
 def MPsentiment(self, i):
     filename = self.path+"/"+self.files[i]
     print(filename)
     a = article(filename, self.city, self.files[i])
     self.articles.append(a)
     if a.data is None:
         a.calculateWords()
Example 3
    def read_json(self, input_file, quotechar=None):
        lists = []
        with codecs.open(input_file, "r", "utf-8") as fd:
            rc = json.loads(fd.read())
            for item in rc:
                artical_ins = article()
                label = "None" if "label" not in item.keys() else str(
                    item["label"])
                title = "None" if "title" not in item.keys() else item["title"]
                content = "None" if "content" not in item.keys(
                ) else item["content"]
                ctime = "None" if "ctime" not in item.keys() else str(
                    item["ctime"])
                url = "None" if "url" not in item.keys() else item["url"]
                uuid = "None" if "uuid" not in item.keys() else item["uuid"]
                artical_ins.label = label.replace("\n", "").replace(
                    "\r", "").replace("\t", "")
                artical_ins.title = title.replace("\n", "").replace(
                    "\r", "").replace("\t", "")
                artical_ins.content = content.replace("\n", "").replace(
                    "\r", "").replace("\t", "")
                artical_ins.ctime = ctime.replace("\n", "").replace(
                    "\r", "").replace("\t", "")
                artical_ins.url = url.replace("\n", "").replace("\r",
                                                                "").replace(
                                                                    "\t", "")
                artical_ins.uuid = uuid.replace("\n", "").replace("\r",
                                                                  "").replace(
                                                                      "\t", "")

                lists.append(artical_ins)
        lists = list(self._dedupe(lists, key=lambda a: a.title))
        return lists
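The six .replace chains above work, but they are repetitive; the same behavior fits in a much shorter form. A minimal sketch, assuming the same imports and class context as the original (the _clean helper and the dict.get defaults are illustrative additions, not part of the original code):

    def read_json(self, input_file, quotechar=None):
        # Equivalent sketch of the method above.
        def _clean(value):
            # strip newlines, carriage returns and tabs, as the original does
            return str(value).replace("\n", "").replace("\r", "").replace("\t", "")

        lists = []
        with codecs.open(input_file, "r", "utf-8") as fd:
            for item in json.loads(fd.read()):
                artical_ins = article()
                artical_ins.label = _clean(item.get("label", "None"))
                artical_ins.title = _clean(item.get("title", "None"))
                artical_ins.content = _clean(item.get("content", "None"))
                artical_ins.ctime = _clean(item.get("ctime", "None"))
                artical_ins.url = _clean(item.get("url", "None"))
                artical_ins.uuid = _clean(item.get("uuid", "None"))
                lists.append(artical_ins)
        return list(self._dedupe(lists, key=lambda a: a.title))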
Example 4
def test_profile_author():
    url = "https://www.nytimes.com/2020/02/17/world/asia/coronavirus-westerdam-cambodia-hun-sen.html?action=click&module=Top%20Stories&pgtype=Homepage"
    test_article = article(url)
    article_result = test_article.get()
    test_author = authorCard(article_result["article_id"], article_result["author_page_link"],
                             article_result["profile"], article_result["author_name"])
    print(test_author.get())
Example 5
def main():
    f1 = open('plots.txt', 'r')
    f2 = open('titles.txt', 'r')

    f_plots = f1.read()
    f_titles = f2.read()

    plots = f_plots.split("<EOS>")
    titles = f_titles.split("\n")

    articles = []
    for i in range(len(titles)):
        articles.append(article(titles[i], plots[i]))

    csvData = []
    csvData.append(["Title", "NW_Title", "ND_Title", "NW_Plot", "ND_Plot"])
    for i in range(len(titles)):
        csvData.append([
            articles[i].title, articles[i].wordsTitle, articles[i].digitsTitle,
            articles[i].wordsPlot, articles[i].digitsPlot
        ])
    csv.register_dialect('mD', quoting=csv.QUOTE_ALL, skipinitialspace=True)

    with open('dataset2.csv', 'w') as csvFile:
        writer = csv.writer(csvFile, dialect='mD')
        writer.writerows(csvData)
Example 6
def nyt_scraper(my_url):
    '''
    Returns a list of article objects from the scraped news RSS URL.
    You can choose different RSS feed URLs to serve up.
    '''
    # my_url =  # URL of rss feed / whatever you need

    uClient = urllib.urlopen(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html,'xml')

    article_list = [] 

    for item in page_soup.findAll('item'): # For each RSS article  
        title = item.title.text.encode("utf-8")
        description = item.description.text.encode("utf-8")
        date = str(item.pubDate.text)
        link = item.link.text
        
        uClient = urllib.urlopen(link)
        page_html = uClient.read()
        uClient.close()
        page_soup = soup(page_html, "html.parser")
        stories = page_soup.findAll("p")
        body = ""

        for bod in stories: # Gets the strings from the paragraphs 
            body = body + bod.text
        if len(body) > 0:
            temp = article(title,description,date,link,body)
            article_list.append(temp)

    
    return article_list
Example 7
File: app.py Project: townboy/blog
def func_article(artname):
    instance = handleSta.handleSta()
    instance.read()
    instance.setValue(artname, instance.getValue(artname, 0) + 1)
    instance.setValue('pv', instance.getValue('pv', 0) + 1)
    instance.write()
    
    read = article.article('./article/' + artname)
    return read.read()
Example 8
def articles(guid):
    try:
        with open('articles.json') as f:
            articles = [article(_) for _ in json.load(f)]
            for art in articles:
                if (art.guid == guid):
                    return render_template('article.html', article=art)
    except FileNotFoundError:
        return redirect('refresh', code=303)
    return redirect('/', code=303)
Example 9
def index(*args):
    articles = []
    try:
        with open('articles.json') as f:
            try:
                articles = [article(_) for _ in json.load(f)]
            except json.decoder.JSONDecodeError:
                os.remove('articles.json')
                return redirect('refresh', code=303)
    except FileNotFoundError:
        return redirect('refresh', code=303)
    return render_template('main.html', lines=articles)
Example 10
    def get(self):
        '''
        The main function in citationsNetwork: checks whether citation information
        exists in the citations table and uses the Article class to obtain
        information for each citation link.
        :return: a json dictionary that contains:
                'article_paragraphs':  []
                'citation_links':   []
                'citation_info': {'link': {'article_title', 'article_content', 'article_credibility'}}
        :return: None if the article is not in the database
        '''

        json_dict = dict()

        # extract information from database
        citation_results = self.db.lookup_citation(self.article_id)
        if citation_results is not None:
            article_paragraphs = citation_results[0]
            citation_links = citation_results[1]

            # obtain specific info for each citation link
            ## TODO: Advance parallel processing
            ## NOW: limit three citations only (for convenience)
            citation_info = dict()
            count = 0
            for i in range(len(citation_links)):
                one_info = dict()
                # skip nonexistent citations, and stop collecting after three
                if citation_links[i] == "None" or count >= 3:
                    citation_links[i] = "None"
                    continue
                cited_article = article(citation_links[i])
                article_result = cited_article.get()
                if article_result is not None:
                    # create inner dict to store information for one citation
                    one_info['article_title'] = article_result['article_title']
                    one_info['article_content'] = article_result[
                        'article_content']
                    one_info['article_credibility'] = article_result[
                        'article_reliability']
                    citation_info[citation_links[i]] = one_info
                    count += 1
                else:
                    # delete non-profile citation link
                    citation_links[i] = "None"

            json_dict['article_paragraphs'] = article_paragraphs
            json_dict['citation_links'] = citation_links
            json_dict['citation_info'] = citation_info

            return json_dict
        else:
            return None
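A hedged sketch of consuming the dictionary documented above; the network variable and its construction are assumed (they are not shown in this excerpt), while the dictionary keys come from the method itself:

# Hypothetical usage, assuming `network` is an already-constructed
# citationsNetwork instance for an article stored in the database.
result = network.get()
if result is not None:
    for link in result["citation_links"]:
        if link != "None":
            info = result["citation_info"][link]
            print(link, info["article_title"], info["article_credibility"])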
Example 11
 def polarity():
     articles = []
     # store the corresponding percentage for each article
     percent = []
     polarity = 0
     for name in files:
         filename = path + "/" + name
         a = article(filename)
         a.calculateWords()
         articles.append(a)
         # append rather than index into the initially empty list
         percent.append(a.formula())
     for p in percent:
         # accumulate the values themselves, not list lookups by value
         polarity += p
Example 12
def quantify(word, number=0):
    
    """ Returns a phrase describing the number of given objects.
    
    Two objects are described as being a pair,
    smaller than eight is several,
    smaller than twenty is a number of,
    smaller than two hundred are dozens,
    anything bigger is described as being
    tens or hundreds of thousands or millions.
    
    For example:
    chicken, 100 -> dozens of chickens 
    
    """
    
    def _plural(word):
        return plural(word, custom=quantify_custom_plurals)
    
    if number == 0:
        return "no " + _plural(word)
    if number == 1:
        return article(word)
    if number == 2:
        return "a pair of " + _plural(word)
    if number in range(3,8):
        return "several " + _plural(word)
    if number in range(8,20):
        return "a number of " + _plural(word)
    if number in range(20,200):
        return "dozens of " + _plural(word)
        
    if number >= 200:
        
        thousands = int( log(number, 10) / 3 )
        subthousands = int( log(number, 10) % 3 )
        
        if subthousands == 2:
            stword = "hundreds of "
        elif subthousands == 1:
            stword = "tens of "
        else:
            stword = ""
        if thousands > 0:
            thword = _plural(numeral_thousands(thousands-1)) + " of "
        else:
            thword = ""
            
        return stword + thword + _plural(word)
Example 13
def quantify(word, number=0):
    """ Returns a phrase describing the number of given objects.
    
    Two objects are described as being a pair,
    smaller than eight is several,
    smaller than twenty is a number of,
    smaller than two hundred are dozens,
    anything bigger is described as being
    tens or hundreds of thousands or millions.
    
    For example:
    chicken, 100 -> dozens of chickens 
    
    """
    def _plural(word):
        return plural(word, custom=quantify_custom_plurals)

    if number == 0:
        return "no " + _plural(word)
    if number == 1:
        return article(word)
    if number == 2:
        return "a pair of " + _plural(word)
    if number in range(3, 8):
        return "several " + _plural(word)
    if number in range(8, 20):
        return "a number of " + _plural(word)
    if number in range(20, 200):
        return "dozens of " + _plural(word)

    if number >= 200:

        thousands = int(log(number, 10) / 3)
        subthousands = int(log(number, 10) % 3)

        if subthousands == 2:
            stword = "hundreds of "
        elif subthousands == 1:
            stword = "tens of "
        else:
            stword = ""
        if thousands > 0:
            thword = _plural(numeral_thousands(thousands - 1)) + " of "
        else:
            thword = ""

        return stword + thword + _plural(word)
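For reference, a few hedged usage lines that follow the docstring's own "chicken, 100 -> dozens of chickens" example; they assume quantify and its plural/numeral helpers are importable from the containing module:

# Expected phrases, traced from the branches of quantify above.
print(quantify("chicken", number=2))      # "a pair of chickens"
print(quantify("chicken", number=100))    # "dozens of chickens" (docstring example)
print(quantify("chicken", number=25000))  # "tens of thousands of chickens"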
Example 14
def article(request):
    print "Running tests"
    description = request.GET.get('description')
    if description is None:
        return render_to_response('project/article.html',
                                  {'description': description})

    queryArticle = a.article(body=description)
    facts = extraction.extractFacts(queryArticle)
    reports = []
    for fact in facts:
        reports.append(filter_f.filter_facts(fact, facts_db))

    return render_to_response('project/article.html', {
        'description': description,
        'reports': reports
    })
Example 15
File: app.py Project: townboy/blog
def index():
    art = os.listdir('./article')
    give = []

    instance = handleSta.handleSta()
    instance.read()
    
    for item in art:
        path = './article/' + item
        read = article.article(path)
        instance.setValue(item, instance.getValue(item, 0) )
        give.append([item, read.read_title() + ' (' + read.read_time() + u'浏览  ' + str(instance.getValue(item, 0)) + ')'])
    give = sorted(give)[::-1]
    getIp = ip.ip().getIpInfo()

    instance.setValue('pv', instance.getValue('pv', 0) + 1)
    
    instance.write()
    return jinja2_template('templates/home.html', domain = settings.domain, users = give, ipInfo = getIp, pv = instance.getValue('pv', 0))
Example 16
def main():
    personas = generatePersonas()
    #
    for subdir, dirs, files in os.walk("../dataset/articles/"):
        for file in files:
            filepath = subdir + os.sep + file
            print (file)
            if file != ".DS_Store":
                with codecs.open(filepath, "r",encoding='utf-8') as myfile:
                    wholeText = myfile.read()
                    print(wholeText)
                    # str.replace returns a new string, so reassign the result
                    for ch in wholeText:
                        if not isAscii(ch):
                            wholeText = wholeText.replace(ch, '')
                    arcl = article.article(wholeText, filepath)
                    for person in personas:
                        val = person.personaConsumer(arcl.tagList)
                        if val == 1:
                            person.articles.append(arcl)
    savePersonas(personas)
Example 17
def ap_scraper(my_url):
    '''
    Returns a list of article objects from the scraped AP news RSS URL.
    You can choose different URLs for Associated Press RSS to serve up.
    '''

    # my_url = "http://feeds.bbci.co.uk/news/rss.xml" # URL of rss feed / whatever you need

    uClient = urllib.urlopen(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, 'xml')

    titles = []
    descriptions = []
    dates = []
    links = []
    for item in page_soup.findAll(
            'item'):  # this could be made more space- and time-efficient
        titles.append(item.title.text.encode("utf-8"))
        descriptions.append(item.description.text.encode("utf-8"))
        dates.append(str(item.pubDate.text))
        links.append(item.link.text)  # append the link itself, not the title
    l = [titles, descriptions, dates, links]

    article_list = []
    for i in range(len(l[0])):
        temp = article(l[0][i], l[1][i], l[2][i], l[3][i])
        article_list.append(temp)

    return article_list

    # print l[0][i]
    # print l[1][i]
    # print l[2][i]
    # print


# for thing in article_list:
#     print thing
#     print
Example 18
    def process_author_credibility(self, author_article_list):
        '''
        helper function for get
        use scrapy crawler to store author credibility information
        and NELA tool to generate credibility scores

        :param author_article_list: a list of article links from this author
        '''
        # use at most the 5 most recent articles to generate reliability and bias scores for the author
        accumlated_reliability = list()
        accumlated_bias = list()
        for article_link in author_article_list:
            # get article credibility to process
            each_article = article(article_link)
            article_result = each_article.get()
            print(article_link)

            if article_result is None:
                continue

            if article_result["article_reliability"] >= 0:
                accumlated_reliability.append(
                    article_result["article_reliability"])
                accumlated_bias.append(article_result["article_bias"])

            if len(accumlated_reliability) >= 5:
                break

        if len(accumlated_reliability) != 0:

            avg_reliability = self.sum_list_scores(accumlated_reliability)
            avg_bias = self.sum_list_scores(accumlated_bias)
        else:
            # -100 represents reliability/bias not available
            avg_reliability = -100
            avg_bias = -100
        self.db.insert_author_credibility(self.author_id, avg_reliability,
                                          avg_bias)
Example 19
def wpo_scraper(my_url):
    '''
    Returns a list of article objects from the scraped Washington Post RSS URL.
    You can choose different Washington Post RSS feeds to serve up.
    '''
    # my_url = "http://feeds.washingtonpost.com/rss/rss_election-2012"
    # URL of rss feed / whatever you need

    uClient = urllib.urlopen(my_url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, 'xml')

    article_list = []

    for item in page_soup.findAll('item'):  # For each RSS article
        title = item.title.text.encode("utf-8")
        description = item.description.text.encode("utf-8")
        date = str(item.pubDate.text)
        link = item.link.text
        print title
        print description
        uClient = urllib.urlopen(link)
        page_html = uClient.read()
        uClient.close()
        page_soup = soup(page_html, "xml")
        stories = page_soup.findAll("div", {"class": "article-body"})
        body = ""

        for bod in stories:  # Gets the strings from the paragraphs
            paragraphs = bod.findAll("p")
            for pars in paragraphs:
                body = body + pars.text
        if len(body) > 0:
            temp = article(title, description, date, link, body)
            article_list.append(temp)

    return article_list
Example 20
def storeStories():
    #print("Updating articles.")
    news.drop()
    stories = []
    year = datetime.now().year
    month = datetime.now().month
    NYT = requests.get("https://api.nytimes.com/svc/archive/v1/" + str(year) +
                       "/" + str(month) + ".json?api-key=" +
                       os.environ.get('NYT_API')).json()

    for story in NYT["response"]["docs"][-min(len(NYT["response"]["docs"]), 100):]:
        art = getNYTArticle(story['web_url'])
        subjectivity, grade, sentimentality = subjectivity_and_grade(art)
        datetime_obj = datetime.strptime(story["pub_date"],
                                         '%Y-%m-%dT%H:%M:%S+%f')
        date_string = str(datetime_obj.month) + "/" + str(
            datetime_obj.day) + "/" + str(datetime_obj.year)
        a = article(int(grade), story["web_url"], "NYT",
                    story["headline"]["main"], story["lead_paragraph"],
                    date_string, subjectivity, sentimentality)
        stories.append(vars(a))

    news.insert_many(stories)
Example 21
def exportsinglearticle(url, silent=False):
    '''
    Fetch the article at the given url and write it to a txt file.
    Variable type: url - String
    '''
    url = url_decorator(url)
    article_wanted = article(url)
    title = article_wanted.gettitle()
    author = article_wanted.getauthor()
    content = article_wanted.getcontent()
    summary = article_wanted.getsummary()
    notes = article_wanted.getnotes()
    chapter = article_wanted.getchap()
    related_chaps = article_wanted.get_related_chaps()
    if not silent:
        print('Exporting ' + title)
    write_totxt('./article',
                title=title,
                author=author,
                content=content,
                chapter=chapter,
                summary=summary,
                notes=notes)
    return related_chaps
Example 22
            if(day < 10):
                URL = year + month + '0' + str(day) + '.html'
            else:
                URL = year + month + str(day) + '.html'
            
            page = requests.get(URL)
            tree = html.fromstring(page.text)
            URLs = tree.xpath('//div[@class="headlineMed"]/a/@href')
            date = URL[-13:-5]

            f = open('output/' + str(date) + '.txt', 'w')
            # generate the random order (Python draws a sample without
            # replacement from a range of numbers)

            for num in random.sample(range(0, len(URLs)), int(len(URLs))):
                doc = ac.article('', date, '', URLs[num], -1)
                curpage = requests.get(doc.URL)
                curtree = html.fromstring(curpage.text)
                Title = curtree.xpath('//*[@id="content"]/div[4]/div/div[3]/div[1]/h1/text()')
                Paragraphs = curtree.xpath('//*[@id="articleText"]/p/text()')
                if len(Title) > 0:
                    doc.Title = Title[0].replace('\"', '')
                    Paragraphs.append(Title[0])
                doc.Text = " ".join(Paragraphs)
                doc.Text = doc.Text.replace('\n', ' ')
                doc.Text = doc.Text.replace('\"', '')

                if(len(doc.Text.split()) > 100):
                    docId = docId + 1
                    doc.id = docId
                    print doc.id
Example 23
            }
        }).sort([('mostComplete', 1)]).limit(5)
        recents = db.table.find().sort([('recentlyUpdated', -1)]).limit(5)

        complete_articles = formatter['article_list'](complete)
        recent_articles = formatter['article_list'](recents)
        all_articles = formatter['article_list'](articles)
        """
		#get the most updated articles
		for article in articles:
			title = article['title']
			author = formatter['authorIDs'](article['authorIDs'])
			year = article['year']
			url = "http://%s?doi=%s" % (article_url, article['doi'])
			output += "<a href='%s'>%s, %s, %s</a><br/>" % (url, title, author, year)
		"""

        output = index_template.render_unicode(recents=recent_articles,
                                               completes=complete_articles,
                                               all_articles=all_articles)

        return output


if __name__ == '__main__':
    cherrypy.config.update({'server.socket_host': url})
    cherrypy.config.update({'server.socket_port': index_port})
    index = index()
    index.article = article.article()
    cherrypy.quickstart(index)
Example 24
def getchapurls(url):
    articls_instance = article(url)
    return articls_instance.get_related_chaps()
Example 25
def test_profile_article():
    url = "https://www.nytimes.com/2020/02/17/world/asia/coronavirus-westerdam-cambodia-hun-sen.html?action=click&module=Top%20Stories&pgtype=Homepage"
    test_article = article(url)
    result = test_article.get()
    print(result)
Example 26
def set_the_article_file_location(step):
	world.article = article()
	world.article.set_file_location(test_xml_path, world.document)
Example 27
            # print 'Input format is "', input
            # print 'Output format is "', output
            # print 'DOI is "', doi

            # Check for minimum parameters - need input and output
    if (
        (input is None and output is None)
        or (input == "fluidinfo" and doi is None)
        or (input == "fluidinfo" and doi is not None and output is None)
    ):
        print >> sys.stderr, "Insufficient parameters supplied"
        print >> sys.stderr, "for help use --help"
        return 2

    if input == "fluidinfo" and doi != None:
        a = article.article(doi)
        a.load_from_fi()
        if output == "article":
            # Test output
            print json.dumps(a.data(), sort_keys=True, indent=4)

        elif output == "fluidinfo":
            # In from fluidinfo, out to fluidinfo (not recommended except for testing purposes)
            load_article_into_fi(a)

    elif input != "fluidinfo":
        # Presumably a document name provided
        a = load_article(input)
        if output == "article":
            # Test output
            print json.dumps(a.data(), sort_keys=True, indent=4)
Example 28
		complete = db.table.find({'mostComplete' : {'$gte' : 1}}).sort([('mostComplete', 1)]).limit(5)
		recents = db.table.find().sort([('recentlyUpdated', -1)]).limit(5)

		complete_articles = formatter['article_list'](complete)
		recent_articles = formatter['article_list'](recents)
		all_articles = formatter['article_list'](articles)
		

		"""
		#get the most updated articles
		for article in articles:
			title = article['title']
			author = formatter['authorIDs'](article['authorIDs'])
			year = article['year']
			url = "http://%s?doi=%s" % (article_url, article['doi'])
			output += "<a href='%s'>%s, %s, %s</a><br/>" % (url, title, author, year)
		"""

		output = index_template.render_unicode(recents = recent_articles, completes = complete_articles, all_articles=all_articles)		


		return output 

if __name__ == '__main__':
	cherrypy.config.update({'server.socket_host':url})
	cherrypy.config.update({'server.socket_port':index_port})
	index = index()
	index.article = article.article()
	cherrypy.quickstart(index)

Example 29
    def process_URL(self):
        '''
        Set up the article class to process and store information in self.article_content,
        and set up the author card class to process and store information in self.author_card.

        :return: sets self.URL_status to True if the article is crawled successfully;
                    otherwise, sets self.URL_status to False
        '''

        # set up article class
        new_article = article(self.URL)
        article_result = new_article.get()
        if article_result is not None:
            self.URL_status = True
            # store article information to self.article_content
            self.article_content = dict()
            self.article_content["article_title"] = article_result[
                "article_title"]
            self.article_content["article_content"] = article_result[
                "article_content"]
            self.article_content["author_name"] = article_result["author_name"]
            self.article_content["publisher_name"] = article_result[
                "publisher_name"]
            self.article_content["article_reliability"] = article_result[
                "article_reliability"]
            self.article_content["article_bias"] = article_result[
                "article_bias"]

            # set up author card class
            new_author_card = authorCard(
                article_id=article_result["article_id"],
                author_page_link=article_result["author_page_link"],
                profile=article_result["profile"],
                author_name=article_result["author_name"])
            author_result = new_author_card.get()

            # store author information to self.author_card
            if author_result is not None:
                self.author_card = dict()
                self.author_card["author_name"] = author_result["author_name"]
                self.author_card["author_introduction"] = author_result[
                    "author_intro"]
                self.author_card["author_reliability_score"] = author_result[
                    "author_reliability"]
                self.author_card["author_bias_score"] = author_result[
                    "author_bias"]
                self.author_card["author_link"] = author_result["author_link"]

            else:
                self.author_card = None

            #set up publisher card class
            new_publisher_card = publisherCard(article_result["profile"])

            publisher_result = new_publisher_card.get()

            # store publisher information to self.publisher_card
            if publisher_result is not None:
                self.publisher_card = dict()
                self.publisher_card["publisher_name"] = publisher_result[
                    "publisher_name"]
                self.publisher_card[
                    "publisher_introduction"] = publisher_result[
                        "publisher_intro"]
                self.publisher_card[
                    "publisher_reliability_score"] = publisher_result[
                        "publisher_reliability_score"]
                self.publisher_card["publisher_link"] = publisher_result[
                    "publisher_link"]

            # set up citation network class
            new_citation_network = citationsNetwork(
                article_result["article_id"])
            citation_network_result = new_citation_network.get()

            if citation_network_result is not None:
                # store partial information in article_content
                self.article_content[
                    "article_paragraphs"] = citation_network_result[
                        "article_paragraphs"]
                self.article_content[
                    "citation_links"] = citation_network_result[
                        "citation_links"]

                # store specific citation info to self.citation_network
                self.citation_network = citation_network_result[
                    "citation_info"]
            else:
                self.citation_network = None

        else:
            self.article_content = dict()
            self.URL_status = False
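A hedged sketch of checking the flags this method sets; the page variable and its construction are assumed (the surrounding class definition is not shown here), while the attribute and key names come from the method itself:

# Hypothetical usage, assuming `page` is an instance of the surrounding class,
# already constructed with a URL.
page.process_URL()
if page.URL_status:
    print(page.article_content["article_title"])
    if page.author_card is not None:
        print(page.author_card["author_name"])
else:
    print("article could not be crawled")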
Example 30
 def article(self, word):
     return article.article(word)
Example 31
def load_article(document=None):
    # Build an article object from XML file
    path = None
    a = article.article()
    a.parse_document(path, document)
    return a
Example 32
def testCall():
    testArticle = article(title, description, date, link, body)
    print extractFacts(testArticle)
Example 33
File: app.py Project: townboy/blog
def aboutme():
    read = article.article('./special/aboutme.md')
    return read.read()
Example 34
            if (day < 10):
                URL = year + month + '0' + str(day) + '.html'
            else:
                URL = year + month + str(day) + '.html'

            page = requests.get(URL)
            tree = html.fromstring(page.text)
            URLs = tree.xpath('//div[@class="headlineMed"]/a/@href')
            date = URL[-13:-5]

            f = open('output/' + str(date) + '.txt', 'w')
            # generate the random order (Python draws a sample without
            # replacement from a range of numbers)

            for num in random.sample(range(0, len(URLs)), int(len(URLs))):
                doc = ac.article('', date, '', URLs[num], -1)
                curpage = requests.get(doc.URL)
                curtree = html.fromstring(curpage.text)
                Title = curtree.xpath(
                    '//*[@id="content"]/div[4]/div/div[3]/div[1]/h1/text()')
                Paragraphs = curtree.xpath('//*[@id="articleText"]/p/text()')
                if len(Title) > 0:
                    doc.Title = Title[0].replace('\"', '')
                    Paragraphs.append(Title[0])
                doc.Text = " ".join(Paragraphs)
                doc.Text = doc.Text.replace('\n', ' ')
                doc.Text = doc.Text.replace('\"', '')

                if (len(doc.Text.split()) > 100):
                    docId = docId + 1
                    doc.id = docId
Example 35
 def query(self, title):
     self.cursor.execute(self.query_by_title_sql, (title, ))
     data = self.cursor.fetchall()[0]
     art = article(data[1], data[2], data[3], data[4])
     return art.__dict__