Example #1
def create_all_tables(flotte_db):
    mycursor = flotte_db.cursor()
    Article.create_Article_tb(mycursor)
    Commande.create_Commande_tb(mycursor)
    Table.create_Table_tb(mycursor)
    Type.create_Type_tb(mycursor)
    Robot.create_Robot_tb(mycursor)
    Positions.create_Pose_tb(mycursor)
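A minimal usage sketch for the helper above: it only needs an object exposing .cursor(), here assumed to be a mysql.connector connection (host, credentials and database name are placeholders):

import mysql.connector

flotte_db = mysql.connector.connect(host="localhost", user="user",
                                    password="password", database="flotte")
create_all_tables(flotte_db)
flotte_db.close()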
class ArticleDownloader(object):
    def __init__(self, url):
        """Create a LinkDownloader.

        Arguments:
        url -- The URL of the article to download.
        """
        self.url = url
        self.article = Article(url)

    def download_article(self):
        self.article.download_and_parse()
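A usage sketch for the class above (the URL is a placeholder):

downloader = ArticleDownloader("https://example.com/some-story.html")
downloader.download_article()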
Example #4
    def parse_article(filename):
        title, text = XmlParser.parse_article(filename)
        intro, unnamed_sections, named_sections = TextParser.parse_text(text)

        uncited_intro, citations = ContentParser.parse_content(intro.content)

        section_metas = {}
        uncited_sections = []
        for section in unnamed_sections:

            # Map section level to other section meta information
            section_meta = section.meta
            section_meta_level = section_meta.level
            if section_meta_level not in section_metas:
                section_metas[section_meta_level] = []
            section_metas[section_meta_level].append(section_meta)

            # Split section content into citations and actual content
            uncited_section, section_citations = ContentParser.parse_content(
                section.content)
            uncited_sections.append(uncited_section)
            citations = citations + section_citations

        return Article.Article(title, intro.meta, uncited_intro, section_metas,
                               uncited_sections, named_sections, citations)
Example #5
    def __init__(self, path):

        self.name = os.path.basename(path)
        self.path = path
        self.stopWords = []
        self.sources = []
        self.articles = []

        #Loading language from file
        langFile = open(path, "r")

        line = langFile.readline().rstrip()

        if (line == "Stopwords{"):
            line = langFile.readline().rstrip()
            while (line != "}"):
                self.stopWords.append(line)  #Creating stopwords list
                line = langFile.readline().rstrip()
        line = langFile.readline().rstrip()
        if (line == "Sources{"):
            line = langFile.readline().rstrip()
            while (line != "}"):
                self.sources.append(line)  #Creating sources list
                line = langFile.readline().rstrip()
        line = langFile.readline().rstrip()
        if (line == "Articles{"):
            line = langFile.readline().rstrip()
            while (line != "}"):
                self.articles.append(
                    Article.Article(line, self.stopWords,
                                    True))  #Creating articles list
                line = langFile.readline().rstrip()

        langFile.close()
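The reader above implies a simple sectioned file layout; a hypothetical language file it would accept, written out in Python (section names and brace placement follow the checks in __init__, the entries themselves are invented):

with open("english.lang", "w") as f:  # hypothetical file name
    f.write("Stopwords{\n"
            "the\n"
            "a\n"
            "}\n"
            "Sources{\n"
            "bbc-news\n"
            "}\n"
            "Articles{\n"
            "article1.txt\n"
            "}\n")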
Example #6
def print_mp_msg(msg):
    print("MP")
    print(msg)
    print(msg.raw)

    try:
        if msg.type == SHARING:
            dict_msg = msg.raw  # dict with the pushed article's info
            article_name = msg.chat.name  # official-account name
            article_title = msg.text  # article title
            sharing_rawurl = dict_msg['Url']
            times = time.localtime(dict_msg['CreateTime'])  # post timestamp
            dt = "%s%02d%s" % (times[0], int(times[1]), times[2])  # formatted date

            article = Article.Article()
            article.article_name = article_name
            article.article_title = article_title
            article.last_edit_time = dt

            print "raw_url:", sharing_rawurl

            article.article_url = sharing_rawurl
            result = WxArticle.parse_html(sharing_rawurl)
            if isinstance(result, dict):
                article.read = result['reads']
                article.like = result['likes']
                articleDao.addArticle(article)
            else:
                article.article_url = ""
                articleDao.addArticle(article)
            print(result)
    except Exception as e:
        print(e)
Example #7
def create_maindb(dbId, dbBaseDir):
    """Look for any subdirs (arbitrary level under dbBaseDir) that start with '#'.
    From these subdirs, try to load ./fulltext.txt and ./meta.xml files, create an article and insert it into database. 
    Store the database persistently as PyDbLite (pickle) file 'gensim_dbId.pdl'.
    """
    dbFname = common.dbFile('gensim', dbId)
    logging.info("opening database %s" % dbFname)
    db = ArticleDB.ArticleDB(dbFname, mode='override', autocommit=False)

    proc_total = 0
    logging.info("processing database %s, directory %s" % (dbId, dbBaseDir))
    for root, dirs, files in os.walk(dbBaseDir):
        root = os.path.normpath(root)
        if os.path.basename(root).startswith('#'):
            proc_total += 1
            try:
                meta = utils_iddb.parseMeta(os.path.join(root, 'meta.xml'))
                #meta = {'msc' : []}
                #meta['id_int'] = Article.idFromDir(root)
                meta['id_int'] = os.path.join(dbId, root[len(dbBaseDir) + 1:])
                meta['body'] = unicode(
                    open(os.path.join(root, 'fulltext.txt'), 'r').read(),
                    'utf8', 'ignore').encode('utf8')
                meta['references'] = None  # TODO add
                art = Article.Article(record=meta)
                db.insertArticle(art)
            except Exception, e:
                logging.warning(
                    'invalid entries in %s; ignoring article (%s)' % (root, e))
                continue
Example #8
def merge(inputs, dbout, acceptLanguage='any'):
    logging.info('merging %i databases, accepting "%s" language' %
                 (len(inputs), acceptLanguage))

    db_merged = ArticleDB.ArticleDB(common.dbFile(dbout, acceptLanguage),
                                    mode='override')
    lang_failed = 0
    for dbId, dbBaseDir in inputs.iteritems():
        db_part = ArticleDB.ArticleDB(common.dbFile('gensim', dbId),
                                      mode='open')
        logging.info("processing %i articles from %s" %
                     (len(db_part.db), dbId))
        inserted = 0
        for rec in db_part.db:
            if acceptLanguage == 'any' or rec['language'] == acceptLanguage:
                db_merged.insertArticle(Article.Article(rec))
                inserted += 1
            else:
                lang_failed += 1
        logging.info("accepted %i articles of %s language from %s" %
                     (inserted, acceptLanguage, dbId))
    db_merged.commit()

    logging.info(
        '%i total articles in the merged database; %i rejected due to different language'
        % (len(db_merged), lang_failed))
Example #9
    def layer1(self, articles):
        spacy_instance = spaCy.SpaCy()

        for raw_article in articles:
            article = Article.Article(raw_article, spacy_instance)
            article.look_for_entities()
            self.articles.append(article)
Example #10
    def nytimes(self, soup):
        if not self.author:
            a = soup.find("meta", {"name": "byl"})
            if a:
                self.author = ' '.join(a['content'].split()[1:])

        if not self.date:
            d = soup.find("meta", {"name": "pdate"})
            if d:
                self.date = convertDate(d['content'], "%Y%m%d")

        if not self.images:
            i = soup.find(itemprop="image")
            if i:
                i = i.get("content")
                self.images.append(i)

        text = ''
        container = soup.find("section", {"name": "articleBody"})
        if container:
            #paragraphs = soup.select("p.css-1ebnwsw.e2kc3sl0")
            paragraphs = container.find_all("p")
            if paragraphs:
                for p in paragraphs:
                    text += (p.text + '\n\n')

        if text == '':
            print("Rejected - likely bad scraping job")
            return None
        else:
            article = Article(self.title, self.author, self.date, self.url,
                              self.source, text.strip(), self.images)
            return article
def loadArts(dbFile='main_cmj.pdl'):
    import ArticleDB
    import Article
    import common
    global db, arts
    db = ArticleDB.ArticleDB(common.dataFile(dbFile), mode='open')
    arts = [Article.Article(rec) for rec in db.db if rec['id_int'] in rdocids]
Example #12
def createMscsDb():
    """Create MSC database of all languages."""
    db = ArticleDB.ArticleDB(ARTS_FILE, mode='override', autocommit=False)
    baseDir = ''

    proc_total = 0
    logging.info("processing directory %s" % common.inputPath(baseDir))
    for root, dirs, files in os.walk(common.inputPath(baseDir)):
        root = os.path.normpath(root)
        if os.path.basename(root).startswith('#'):
            proc_total += 1
            try:
                meta = utils_iddb.parseMeta(os.path.join(root, 'meta.xml'))
                try:
                    meta['body'] = open(os.path.join(root,
                                                     'fulltext.txt')).read()
                except Exception, e:
                    meta['body'] = None
                meta['id_int'] = root[len(common.INPUT_PATH) + 1:]
                meta['references'] = None  # TODO add
                art = Article.Article(record=meta)
                db.insertArticle(art)
            except Exception, e:
                logging.warning(
                    'invalid entries in %s; ignoring article (%s)' % (root, e))
                continue
Example #13
    def jdsupra(self, soup):
        if not self.author:
            a = soup.select_one("div.f6.silver.db.dn-l.mt2.tc-ns a")
            if a:
                self.author = a.text

        if not self.date:
            d = soup.find("time")
            if d:
                self.date = convertDate(d.text, "%B %d, %Y")

        text = ''
        container = soup.find("div", {"class": "jds-main-content"})
        if container:
            paragraphs = container.find_all(["p", "h2"])
            if paragraphs:
                # differentiating between paragraphs and headers - if <p>, separate
                # by double newline; if <h2>, separate by single newline
                for p in paragraphs:
                    if p.name == "p":
                        text += (p.text.strip() + '\n\n')
                    else:
                        text += (p.text.strip() + '\n')

        if text == '':
            print("Text is empty - likely bad scraping job")
            return None
        else:
            article = Article(self.title, self.author, self.date, self.url,
                              self.source, text.strip(), self.images)
            return article
def getRefFiles(rootdir):
    result = {}
    for root, dirs, files in os.walk(rootdir):
        for file in files:
            if file == 'references.xml':
                id = Article.idFromDir(root)
                result[os.path.join(root, file)] = id
    return result
def test_if_sorted():
    with open('./articles/test.txt', 'w') as t:
        t.write('c b a')
    with open('./old.txt', 'w') as t:
        t.write('b')
    CONFIG = safe_IO.load_json('./FAIDK.config')
    article = Article.Article(CONFIG, 'test.txt', '2')
    assert article.new_words == ['a', 'c']
Example #17
def test_basic(db):
    """
    Basic model saving with required fields should not fail
    """
    blog = Blog(
        title="Foo",
    )
    blog.save()

    article = Article(
        blog=blog,
        title="Bar",
    )
    article.full_clean()
    article.save()

    url = "/{blog_pk}/{article_pk}/".format(
        blog_pk=blog.id,
        article_pk=article.id
    )

    assert 1 == Article.objects.filter(title="Bar").count()
    assert "Bar" == article.title
    assert blog == article.blog
    assert url == article.get_absolute_url()
Example #18
def test_required_fields(db):
    """
    Basic model validation with missing required fields should fail
    """
    blog = Blog(
        title="Foo",
    )
    blog.save()

    article = Article()

    with pytest.raises(ValidationError) as excinfo:
        article.full_clean()

    assert excinfo.value.message_dict == {
        "blog": ["This field cannot be null."],
        "title": ["This field cannot be blank."],
    }
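A minimal sketch of Blog and Article models consistent with the two tests above; the field options and URL pattern are inferred from the assertions, not taken from the actual project:

from django.db import models


class Blog(models.Model):
    title = models.CharField(max_length=255)


class Article(models.Model):
    # null=False (the default) yields "This field cannot be null." on full_clean()
    blog = models.ForeignKey(Blog, on_delete=models.CASCADE)
    # blank=False (the default) yields "This field cannot be blank." on full_clean()
    title = models.CharField(max_length=255)

    def get_absolute_url(self):
        return "/{}/{}/".format(self.blog_id, self.id)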
    def init_articles(self, raw_dict):
        # Headline (top) article
        article_first = art.Article(raw_dict['app_msg_ext_info'], self.biz,
                                     self.nickname, self.datetime,
                                     self.standardtime)
        # Keep it only if it has not been deleted
        if article_first.del_flag != 1:
            self.articles[article_first.idx] = article_first
        # Secondary articles in the multi-item list
        other_dict = raw_dict['app_msg_ext_info']['multi_app_msg_item_list']
        for item in other_dict:
            article = art.Article(item, self.biz, self.nickname,
                                  self.datetime, self.standardtime)
            if article.del_flag != 1:
                self.articles[article.idx] = article
Example #20
    def getArticles(self):
        indent = '<div class=\"NiLAwe y6IFtc R7GTQ keNKEd j7vNaf nID9nc\">'
        endent = '</article>'
        self.out = []
        while self.code.find(indent) != -1:
            self.code = self.code[self.code.find(indent):]
            self.out.append(Article.Article(
                self.code[:self.code.find(endent)]))
            self.code = self.code[self.code.find(endent):]
        return self.out
    def insertArticle(self, art):
        """Insert article into the database, with an id consistency check."""
        present = []
        if art.id_int is not None:
            present.extend(self.db._id_int[art.id_int])
#        if art.idZBL != None:
#            present.extend(self.db._idZBL[art.idZBL])
#        if art.idMR != None:
#            present.extend(self.db._idMR[art.idMR])
        ids = list(set([rec['__id__'] for rec in present]))  # unique ids
        # remove duplicate identical entries (coming from matches on more than one id on the same article)
        present = [self.db[id] for id in ids]
        new = art
        for old in present:  # FIXME HACK turns off consistency checking
            try:
                # article already present in database -- check if ids are consistent, update it with new info from art
                new.mergeWith(Article.Article(record=old))
            except Exception, e:
#                logging.error('inconsistent database contents (%i overlapping records); leaving database unchanged' % (len(present)))
                #logging.info('inconsistency between \n%s\n%s' % (new, Article.Article(old)))
                logging.warning('inconsistency between %s and %s' % (new, Article.Article(old)))
Example #22
def scrape(websiteName):
    response = requests.get(websiteName)
    soup = BeautifulSoup(response.content, "html.parser")
    elements=[]

    if websiteName == "https://www.news.google.com":
        print("google")
        main_tag = 'div'
        main_attrs = {'jscontroller':'d0DtYd'}
        title_tag = 'a'
        title_attrs = {'class':'DY5T1d'}
        newssite_tag = 'a'
        newssite_attrs = {'class':'wEwyrc AVN2gc uQIVzc Sksgp'}
        date_tag = 'time'
    elif websiteName == "https://uk.news.yahoo.com":
        print("yahoo")
        main_tag = 'div'
        main_attrs = {'class':'Ov(h) Pend(44px) Pstart(25px)'}
        title_tag = 'a'
        title_attrs = {'class':'Fw(b) Fz(20px) Lh(23px) Fz(17px)--sm1024 Lh(19px)--sm1024 mega-item-header-link Td(n) C(#0078ff):h C(#000) LineClamp(2,46px) LineClamp(2,38px)--sm1024 not-isInStreamVideoEnabled'}
        newssite_tag = 'div'
        newssite_attrs = {'class':'C(#959595) Fz(11px) D(ib) Mb(6px)'}
        date_tag = 'div'
    elif websiteName == "https://www.bing.com/news":
        print("bing")
        main_tag = 'div'
        main_attrs = {'class':'caption'}
        title_tag = 'a'
        title_attrs = {'class':'title'}
        newssite_tag = 'a'
        newssite_attrs = {'aria-label':re.compile('Search news from .*')}
        date_tag = 'span'
    else:
        print("not predefined website")
        article = Article(websiteName)
        article.download()
        for a in soup.findAll('p'):
            elements.append(a.text)
        return elements


    # with open("temp.txt", 'w') as file:
    #     file.write(soup.prettify().encode('utf-8'))
    #print(soup.prettify())

    for a in soup.findAll(main_tag, attrs=main_attrs):
        #print("in findAll loop")
        title=a.find(title_tag, attrs=title_attrs).text
        newssite=a.find(newssite_tag, attrs=newssite_attrs).text
        date=a.find(date_tag).text

        el={"title": title, "newssite": newssite, "date": date}
        elements.append(el)

    return elements
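A usage sketch for one of the predefined sites; each element in the returned list is a dict with title, newssite and date keys:

headlines = scrape("https://www.bing.com/news")
for item in headlines:
    print(item["title"], "|", item["newssite"], "|", item["date"])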
Example #23
def create_class_object(doc_type, mas):
    obj = None
    if doc_type == 'book':
        obj = Book.Book(*mas)
    elif doc_type == 'AV':
        obj = AV_materials.AVmaterial(*mas)
    elif doc_type == 'article':
        obj = Article.JournalArticle(*mas)
    elif doc_type == 'user':
        obj = User.User(*mas)
    return obj
Example #24
def update():
    '''
    Scheduled update: fetch the list of URLs that need updating from the database,
    then save the refreshed information back to the database.
    :param article_list: URLs to update
    :return:
    '''
    # article_list = 'list of URLs to update, returned by a database helper'
    # for url in article_list:
    #         time.sleep(300)
    try:

        times = time.localtime(time.time()-24*60*60)
        lastdate = "%s%02d%s" % (times[0], int(times[1]), times[2])
        article_tuple = articleDao.getArticleByDate(lastdate)
        article_list = []

        for tuple_ in article_tuple:
            list_ = list(tuple_)
            if list_[3] != "":
                article_list.append(list_)

        list_update = []
        print(article_list)
        for list__ in article_list:
            print(list__[3])
            result = parse_html(list__[3])
            if isinstance(result, dict):
                list__[-3] = result['reads']
                list__[-2] = result['likes']

                print('success')
            else:
                print('error')
                print(result)
            list_update.append(list__)
            time.sleep(30)

        print(list_update)
        for a in list_update:
            article = Article.Article()
            article.id = a[0]
            article.article_name = a[1]
            article.article_title = a[2]
            article.article_url = a[3]
            article.read = a[4]
            article.like = a[5]
            article.last_edit_time = a[6]
            articleDao.modifyArticle(article)
        print "update success"
    except Exception as e:
        print e.message
Example #25
def getNewEntity(entityName):
    obj = None
    if (entityName == 'articles'):
        obj = Article()
    if (entityName == 'customers'):
        obj = Customer()
    if (entityName == 'suppliers'):
        obj = Supplier()
    #if (entityName == 'purchases'):
    #    obj = Article()
    #if (entityName == 'sales'):
    #    obj = Article()
    return obj
def test_if_learn_quit():
    clear_all()
    with open('./articles/test.txt','w') as t:
        t.write('a b c d')
    CONFIG = safe_IO.load_json('./FAIDK.config')
    article = Article.Article(CONFIG, 'test.txt', '1')
    with open('old.txt') as o:
        assert o.read() == '\na'
    with open('new.txt') as n:
        assert n.read() == 'b'
    with open('./articles/l_test.txt') as n:
        assert n.read() == 'c\nd'
    clear_all()
Example #27
def streamNews(language):
    streamedNews = []
    for sourceName in language.getSources():
        link = 'http://newsapi.org/v2/top-headlines?sources={0}&apiKey=c351e7c333d74c3ca1d882732176a67e'.format(sourceName)
        get = request.urlopen(link)     #Getting articles data

        content = json.loads(get.read().decode())

        for article in content['articles']:             #Creating articles list
            try:
                art = Article.Article(article, language.getStopWords(), False)
                streamedNews.append(art)
            except:
                pass
    return streamedNews     
Example #28
    def genericScraper(self):
        config = newspaper.Config()
        config.browser_user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14'
        #config.request_timeout = 15

        # washingtonpost and usnews get funky when you set a user agent for some reason
        # (WaPo fails if the timeout isn't long, usnews throws a 403)
        if self.source not in ["washingtonpost", "usnews"]:
            a = newspaper.Article(self.url, config=config)
        else:
            a = newspaper.Article(self.url)
        try:  # make sure page download goes smoothly
            a.download()
            a.parse()
        except Exception as e:
            print("Rejected - DOWNLOAD ERROR: ", e)
            return None

        text = cleanText(a.text)
        # not much article text - the full article was likely not picked up; worst case,
        # a short article is rejected (probably not all that useful in the long run)
        if len(text) < 500:
            print("Rejected - Article text was less than 500 characters, "
                  "likely bad scraping job")
            return None

        # get title, author, date and images as necessary
        if not self.title:
            if a.title:
                self.title = a.title

        if not self.author:
            if a.authors:
                self.author = a.authors[0]

        if not self.date:
            if a.publish_date:
                self.date = a.publish_date.strftime("%Y-%m-%d")

        if not self.images:
            if a.top_image:
                self.images.append(a.top_image)

        article = Article(self.title, self.author, self.date, self.url,
                          self.source, text.strip(), self.images)
        return article
Example #29
def ParseArticleList(Root):
    Path = '//div[@class="r-ent"]'
    ArticleNodes = Root.xpath(Path)
    ArticleList = []
    for ArticleNode in ArticleNodes:
        Title = ArticleNode.xpath('div[@class="title"]/a/text()')
        if IsNullNode(Title):
            continue
        Title = Title[0]
        if "本文已被刪除" in Title:
            continue

        ArticleUrl = ArticleNode.xpath('div[@class="title"]/a')
        if IsNullNode(ArticleUrl):
            continue
        ArticleUrl = ArticleUrl[0].attrib['href']

        Push = ArticleNode.xpath('div[@class="nrec"]/span/text()')
        if IsNullNode(Push):
            Push = 0
        else:
            Push = Push[0]

        Mark = ArticleNode.xpath('div[@class="mark"]/text()')
        if IsNullNode(Mark):
            Mark = ""
        else:
            Mark = Mark[0]

        Author = ArticleNode.xpath('div[@class="meta"]/div[1]/text()')
        if IsNullNode(Author):
            continue
        Author = Author[0]

        PostDate = ArticleNode.xpath('div[@class="meta"]/div[2]/text()')
        if IsNullNode(PostDate):
            continue
        PostDate = PostDate[0]

        ArticleList.append(
            Article(Title=Title,
                    Meta=Meta(Author, PostDate),
                    Push=Push,
                    Mark=Mark,
                    ContentUrl=ArticleUrl))
    return ArticleList
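A sketch of how the Root element might be produced, assuming a PTT board index page fetched with requests and parsed with lxml (the URL and the over18 cookie are assumptions, not part of the original example):

import requests
from lxml import etree

html = requests.get("https://www.ptt.cc/bbs/Gossiping/index.html",
                    cookies={"over18": "1"}).text
Root = etree.HTML(html)
articles = ParseArticleList(Root)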
Example #30
    def search(self, kensu):
        print('Search start')
        at = Article.Article()
        for i in range(1, kensu + 1):
            # Switch to the next page here
            self.driver.get(self.url + '/latest/page/' + str(i))
            lists = self.driver.find_elements_by_class_name("list-group")
            for group in lists:
                hrefs = group.find_elements_by_tag_name("a")
                # Walk the anchors in this group
                for href in hrefs:
                    # Store the href attribute of each anchor tag
                    self.link_list.append(href.get_attribute("href"))

            # Here we need to call Article to fetch each URL's detail, game-record (kifu) and evaluation info
            at.article(self.link_list)
            self.link_list = []
        self.driver.quit()
def parseReferences(reffile, downloadMSC=False):
    """parse references file (references.xml) and return references as a list of Article objects"""
    global attempts_all, attempts_success, attempts_failed
    reflist = []  # the result
    f = open(reffile, 'r')
    for line in f:
        line = line.strip()
        if line.startswith('<title>'):
            art = Article.Article()
            art.title = line.replace("<title>", "").replace("</title>", "")
            line = f.next().strip()
            while not line.startswith("</reference>"):
                if line.startswith("<link "):
                    type = line[line.find('source="') + len('source="'):]
                    type = type[:type.find('"')]
                    id = line[line.find('id="') + len('id="'):]
                    id = id[:id.find('"')]
                    if type == "zbl":
                        art.idZBL = id
                    elif type == "mref":
                        art.idMR = id
                    else:
                        #                        logging.debug("unknown source in %s: %s" % (reffile, type))
                        pass
                    line = f.next().strip()
                    if type in ['zbl', 'mref'] and downloadMSC:
                        url = line[1:line.rfind("</link>")]
                        msc = findmsc.findmsc(url)
                        attempts_all += 1
                        if not msc:
                            attempts_failed += 1
                            logging.warning(
                                "could not retrieve any MSC from url: %s" %
                                url)
                        else:
                            attempts_success += 1
                            if art.msc == None:
                                art.msc = []
                            art.msc.extend(msc)
                            art.msc = list(set(art.msc))
                line = f.next().strip()
            reflist.append(art)  # add the article into result
    f.close()
    return reflist
    def post_article(self, title, url, user_id, comm_id):
        article = Article(title, url, user_id, comm_id)
        article_id = article.get_id()
        self.id_to_article[article_id] = article
        self.article_ids.append(article_id)
        return article_id