def create_all_tables(flotte_db):
    mycursor = flotte_db.cursor()
    Article.create_Article_tb(mycursor)
    Commande.create_Commande_tb(mycursor)
    Table.create_Table_tb(mycursor)
    Type.create_Type_tb(mycursor)
    Robot.create_Robot_tb(mycursor)
    Positions.create_Pose_tb(mycursor)

def __init__(self, url):
    """Create a LinkDownloader.

    Arguments:
    url -- The URL of the article to download.
    """
    self.url = url
    self.article = Article(url)

class ArticleDownloader(object):
    def __init__(self, url):
        """Create an ArticleDownloader.

        Arguments:
        url -- The URL of the article to download.
        """
        self.url = url
        self.article = Article(url)

    def download_article(self):
        self.article.download_and_parse()

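# A minimal usage sketch of the ArticleDownloader class above; the URL is a
# placeholder, and it assumes the wrapped Article class really provides the
# download_and_parse() method called by download_article().
if __name__ == "__main__":
    downloader = ArticleDownloader("https://example.com/some-story")
    downloader.download_article()
    print(downloader.url)
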
def parse_article(filename):
    title, text = XmlParser.parse_article(filename)
    intro, unnamed_sections, named_sections = TextParser.parse_text(text)
    uncited_intro, citations = ContentParser.parse_content(intro.content)

    section_metas = {}
    uncited_sections = []
    for section in unnamed_sections:
        # Map section level to other section meta information
        section_meta = section.meta
        section_meta_level = section_meta.level
        if section_meta_level not in section_metas:
            section_metas[section_meta_level] = []
        section_metas[section_meta_level].append(section_meta)

        # Split section content into citations and actual content
        uncited_section, section_citations = ContentParser.parse_content(
            section.content)
        uncited_sections.append(uncited_section)
        citations = citations + section_citations

    return Article.Article(title, intro.meta, uncited_intro, section_metas,
                           uncited_sections, named_sections, citations)

def __init__(self, path):
    self.name = os.path.basename(path)
    self.path = path
    self.stopWords = []
    self.sources = []
    self.articles = []

    # Loading language from file
    langFile = open(path, "r")
    line = langFile.readline().rstrip()
    if (line == "Stopwords{"):
        line = langFile.readline().rstrip()
        while (line != "}"):
            self.stopWords.append(line)  # Creating stopwords list
            line = langFile.readline().rstrip()
    line = langFile.readline().rstrip()
    if (line == "Sources{"):
        line = langFile.readline().rstrip()
        while (line != "}"):
            self.sources.append(line)  # Creating sources list
            line = langFile.readline().rstrip()
    line = langFile.readline().rstrip()
    if (line == "Articles{"):
        line = langFile.readline().rstrip()
        while (line != "}"):
            self.articles.append(
                Article.Article(line, self.stopWords, True))  # Creating articles list
            line = langFile.readline().rstrip()
    langFile.close()

def print_mp_msg(msg):
    print("MP")
    print(msg)
    print msg.raw
    try:
        if msg.type == SHARING:
            dict_msg = msg.raw  # dict with the shared article's info
            article_name = msg.chat.name  # official account name
            article_title = msg.text  # article title
            sharing_rawurl = dict_msg['Url']
            times = time.localtime(dict_msg['CreateTime'])  # post timestamp
            dt = "%s%02d%s" % (times[0], int(times[1]), times[2])  # format the date
            article = Article.Article()
            article.article_name = article_name
            article.article_title = article_title
            article.last_edit_time = dt
            print "raw_url:", sharing_rawurl
            article.article_url = sharing_rawurl
            result = WxArticle.parse_html(sharing_rawurl)
            if isinstance(result, dict):
                article.read = result['reads']
                article.like = result['likes']
                articleDao.addArticle(article)
            else:
                article.article_url = ""
                articleDao.addArticle(article)
                print result
    except Exception as e:
        print e.message

def create_maindb(dbId, dbBaseDir):
    """Look for any subdirs (arbitrary level under dbBaseDir) that start with '#'.

    From these subdirs, try to load ./fulltext.txt and ./meta.xml files, create
    an article and insert it into the database. Store the database persistently
    as a PyDbLite (pickle) file 'gensim_dbId.pdl'.
    """
    dbFname = common.dbFile('gensim', dbId)
    logging.info("opening database %s" % dbFname)
    db = ArticleDB.ArticleDB(dbFname, mode='override', autocommit=False)
    proc_total = 0
    logging.info("processing database %s, directory %s" % (dbId, dbBaseDir))
    for root, dirs, files in os.walk(dbBaseDir):
        root = os.path.normpath(root)
        if os.path.basename(root).startswith('#'):
            proc_total += 1
            try:
                meta = utils_iddb.parseMeta(os.path.join(root, 'meta.xml'))
                #meta = {'msc' : []}
                #meta['id_int'] = Article.idFromDir(root)
                meta['id_int'] = os.path.join(dbId, root[len(dbBaseDir) + 1:])
                meta['body'] = unicode(
                    open(os.path.join(root, 'fulltext.txt'), 'r').read(),
                    'utf8', 'ignore').encode('utf8')
                meta['references'] = None  # TODO add
                art = Article.Article(record=meta)
                db.insertArticle(art)
            except Exception, e:
                logging.warning(
                    'invalid entries in %s; ignoring article (%s)' % (root, e))
                continue

def merge(inputs, dbout, acceptLanguage='any'):
    logging.info('merging %i databases, accepting "%s" language' %
                 (len(inputs), acceptLanguage))
    db_merged = ArticleDB.ArticleDB(common.dbFile(dbout, acceptLanguage),
                                    mode='override')
    lang_failed = 0
    for dbId, dbBaseDir in inputs.iteritems():
        db_part = ArticleDB.ArticleDB(common.dbFile('gensim', dbId),
                                      mode='open')
        logging.info("processing %i articles from %s" %
                     (len(db_part.db), dbId))
        inserted = 0
        for rec in db_part.db:
            if acceptLanguage == 'any' or rec['language'] == acceptLanguage:
                db_merged.insertArticle(Article.Article(rec))
                inserted += 1
            else:
                lang_failed += 1
        logging.info("accepted %i articles of %s language from %s" %
                     (inserted, acceptLanguage, dbId))
    db_merged.commit()
    logging.info(
        '%i total articles in the merged database; %i rejected due to different language'
        % (len(db_merged), lang_failed))

def layer1(self, articles):
    spacy_instance = spaCy.SpaCy()
    for index in range(0, len(articles)):
        article = Article.Article(articles[index], spacy_instance)
        article.look_for_entities()
        self.articles.append(article)

def nytimes(self, soup):
    if not self.author:
        a = soup.find("meta", {"name": "byl"})
        if a:
            self.author = ' '.join(a['content'].split()[1:])
    if not self.date:
        d = soup.find("meta", {"name": "pdate"})
        if d:
            self.date = convertDate(d['content'], "%Y%m%d")
    if not self.images:
        i = soup.find(itemprop="image")
        if i:
            i = i.get("content")
            self.images.append(i)
    text = ''
    container = soup.find("section", {"name": "articleBody"})
    if container:
        #paragraphs = soup.select("p.css-1ebnwsw.e2kc3sl0")
        paragraphs = container.find_all("p")
        if paragraphs:
            for p in paragraphs:
                text += (p.text + '\n\n')
    if text == '':
        print("Rejected - likely bad scraping job")
        return None
    else:
        article = Article(self.title, self.author, self.date, self.url,
                          self.source, text.strip(), self.images)
        return article

def loadArts(dbFile='main_cmj.pdl'):
    import ArticleDB
    import Article
    import common
    global db, arts
    db = ArticleDB.ArticleDB(common.dataFile(dbFile), mode='open')
    arts = [Article.Article(rec) for rec in db.db if rec['id_int'] in rdocids]

def createMscsDb():
    """Create MSC database of all languages."""
    db = ArticleDB.ArticleDB(ARTS_FILE, mode='override', autocommit=False)
    baseDir = ''
    proc_total = 0
    logging.info("processing directory %s" % common.inputPath(baseDir))
    for root, dirs, files in os.walk(common.inputPath(baseDir)):
        root = os.path.normpath(root)
        if os.path.basename(root).startswith('#'):
            proc_total += 1
            try:
                meta = utils_iddb.parseMeta(os.path.join(root, 'meta.xml'))
                try:
                    meta['body'] = open(os.path.join(root, 'fulltext.txt')).read()
                except Exception, e:
                    meta['body'] = None
                meta['id_int'] = root[len(common.INPUT_PATH) + 1:]
                meta['references'] = None  # TODO add
                art = Article.Article(record=meta)
                db.insertArticle(art)
            except Exception, e:
                logging.warning(
                    'invalid entries in %s; ignoring article (%s)' % (root, e))
                continue

def jdsupra(self, soup):
    if not self.author:
        a = soup.select_one("div.f6.silver.db.dn-l.mt2.tc-ns a")
        if a:
            self.author = a.text
    if not self.date:
        d = soup.find("time")
        if d:
            self.date = convertDate(d.text, "%B %d, %Y")
    text = ''
    container = soup.find("div", {"class": "jds-main-content"})
    if container:
        paragraphs = container.find_all(["p", "h2"])
        if paragraphs:
            for p in paragraphs:
                # differentiating between paragraphs and headers - if <p>,
                # separate by double newline; if <h2>, separate by single newline
                if p.name == "p":
                    text += (p.text.strip() + '\n\n')
                else:
                    text += (p.text.strip() + '\n')
    if text == '':
        print("Text is empty - likely bad scraping job")
        return None
    else:
        article = Article(self.title, self.author, self.date, self.url,
                          self.source, text.strip(), self.images)
        return article

def getRefFiles(rootdir):
    result = {}
    for root, dirs, files in os.walk(rootdir):
        for file in files:
            if file == 'references.xml':
                id = Article.idFromDir(root)
                result[os.path.join(root, file)] = id
    return result

def test_if_sorted():
    with open('./articles/test.txt', 'w') as t:
        t.write('c b a')
    with open('./old.txt', 'w') as t:
        t.write('b')
    CONFIG = safe_IO.load_json('./FAIDK.config')
    article = Article.Article(CONFIG, 'test.txt', '2')
    assert article.new_words == ['a', 'c']

def test_basic(db):
    """
    Basic model saving with required fields should not fail
    """
    blog = Blog(
        title="Foo",
    )
    blog.save()

    article = Article(
        blog=blog,
        title="Bar",
    )
    article.full_clean()
    article.save()

    url = "/{blog_pk}/{article_pk}/".format(
        blog_pk=blog.id, article_pk=article.id
    )

    assert 1 == Article.objects.filter(title="Bar").count()
    assert "Bar" == article.title
    assert blog == article.blog
    assert url == article.get_absolute_url()

def test_required_fields(db):
    """
    Basic model validation with missing required fields should fail
    """
    blog = Blog(
        title="Foo",
    )
    blog.save()

    article = Article()

    with pytest.raises(ValidationError) as excinfo:
        article.full_clean()

    assert excinfo.value.message_dict == {
        "blog": ["This field cannot be null."],
        "title": ["This field cannot be blank."],
    }

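# A minimal sketch of the Django models the two tests above appear to assume;
# the field names and the URL pattern come from the assertions, while the app
# layout and field lengths are guesses.
from django.db import models


class Blog(models.Model):
    title = models.CharField(max_length=255)


class Article(models.Model):
    blog = models.ForeignKey(Blog, on_delete=models.CASCADE)
    title = models.CharField(max_length=255)

    def get_absolute_url(self):
        # Matches the "/{blog_pk}/{article_pk}/" format checked in test_basic
        return "/{}/{}/".format(self.blog_id, self.pk)
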
def init_articles(self, raw_dict):
    # Top (headline) article
    article_first = art.Article(raw_dict['app_msg_ext_info'], self.biz,
                                self.nickname, self.datetime,
                                self.standardtime)
    # Keep it only if it has not been deleted
    if article_first.del_flag != 1:
        self.articles[article_first.idx] = article_first
    # Remaining articles in the multi-item list
    other_dict = raw_dict['app_msg_ext_info']['multi_app_msg_item_list']
    num = len(other_dict)
    i = 0
    while i < num:
        article = art.Article(other_dict[i], self.biz, self.nickname,
                              self.datetime, self.standardtime)
        if article.del_flag != 1:
            self.articles[article.idx] = article
        i += 1

def getArticles(self):
    indent = '<div class=\"NiLAwe y6IFtc R7GTQ keNKEd j7vNaf nID9nc\">'
    endent = '</article>'
    self.out = []
    while self.code.find(indent) != -1:
        self.code = self.code[self.code.find(indent):]
        self.out.append(Article.Article(
            self.code[:self.code.find(endent)]))
        self.code = self.code[self.code.find(endent):]
    return self.out

def insertArticle(self, art):
    """insert article into database, with id consistency check"""
    present = []
    if art.id_int != None:
        present.extend(self.db._id_int[art.id_int])
#    if art.idZBL != None:
#        present.extend(self.db._idZBL[art.idZBL])
#    if art.idMR != None:
#        present.extend(self.db._idMR[art.idMR])
    ids = list(set([rec['__id__'] for rec in present]))  # unique ids
    present = [self.db[id] for id in ids]  # remove duplicate identical entries (coming from matches on more than one id on the same article)
    new = art
    for old in present:
        # FIXME HACK turns off consistency checking
        try:
            # article already present in database -- check if ids are
            # consistent, update it with new info from art
            new.mergeWith(Article.Article(record=old))
        except Exception, e:
            # logging.error('inconsistent database contents (%i overlapping records); leaving database unchanged' % (len(present)))
            #logging.info('inconsistency between \n%s\n%s' % (new, Article.Article(old)))
            logging.warning('inconsistency between %s and %s' %
                            (new, Article.Article(old)))

def scrape(websiteName):
    response = requests.get(websiteName)
    soup = BeautifulSoup(response.content, "html.parser")
    elements = []
    if websiteName == "https://www.news.google.com":
        print("google")
        main_tag = 'div'
        main_attrs = {'jscontroller': 'd0DtYd'}
        title_tag = 'a'
        title_attrs = {'class': 'DY5T1d'}
        newssite_tag = 'a'
        newssite_attrs = {'class': 'wEwyrc AVN2gc uQIVzc Sksgp'}
        date_tag = 'time'
    elif websiteName == "https://uk.news.yahoo.com":
        print("yahoo")
        main_tag = 'div'
        main_attrs = {'class': 'Ov(h) Pend(44px) Pstart(25px)'}
        title_tag = 'a'
        title_attrs = {'class': 'Fw(b) Fz(20px) Lh(23px) Fz(17px)--sm1024 Lh(19px)--sm1024 mega-item-header-link Td(n) C(#0078ff):h C(#000) LineClamp(2,46px) LineClamp(2,38px)--sm1024 not-isInStreamVideoEnabled'}
        newssite_tag = 'div'
        newssite_attrs = {'class': 'C(#959595) Fz(11px) D(ib) Mb(6px)'}
        date_tag = 'div'
    elif websiteName == "https://www.bing.com/news":
        print("bing")
        main_tag = 'div'
        main_attrs = {'class': 'caption'}
        title_tag = 'a'
        title_attrs = {'class': 'title'}
        newssite_tag = 'a'
        newssite_attrs = {'aria-label': re.compile('Search news from .*')}
        date_tag = 'span'
    else:
        print("not predefined website")
        article = Article(websiteName)
        article.download()
        for a in soup.findAll('p'):
            elements.append(a.text)
        return elements
    # with open("temp.txt", 'w') as file:
    #     file.write(soup.prettify().encode('utf-8'))
    # print(soup.prettify())
    for a in soup.findAll(main_tag, attrs=main_attrs):
        # print("in findAll loop")
        title = a.find(title_tag, attrs=title_attrs).text
        newssite = a.find(newssite_tag, attrs=newssite_attrs).text
        date = a.find(date_tag).text
        el = {"title": title, "newssite": newssite, "date": date}
        elements.append(el)
    return elements

def create_class_object(doc_type, mas):
    obj = None
    if doc_type == 'book':
        obj = Book.Book(*mas)
    elif doc_type == 'AV':
        obj = AV_materials.AVmaterial(*mas)
    elif doc_type == 'article':
        obj = Article.JournalArticle(*mas)
    elif doc_type == 'user':
        obj = User.User(*mas)
    return obj

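# Hypothetical call of the factory above; the values packed into `mas` are
# placeholders, since the real constructor signatures of Book, AVmaterial,
# JournalArticle and User are not shown here.
new_doc = create_class_object('article', ['A sample title', 'A. Author', 2019])
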
def update():
    '''
    Scheduled update: fetch the URLs that need updating from the database,
    then save the refreshed information back to the database.
    :param article_list: URLs to update
    :return:
    '''
    # article_list = 'list of URLs to update, returned by a database helper'
    # for url in article_list:
    #     time.sleep(300)
    try:
        times = time.localtime(time.time() - 24 * 60 * 60)
        lastdate = "%s%02d%s" % (times[0], int(times[1]), times[2])
        article_tuple = articleDao.getArticleByDate(lastdate)
        article_list = []
        for tuple_ in article_tuple:
            list_ = []
            for _ in tuple_:
                list_.append(_)
            if list_[3] != "":
                article_list.append(list_)
        list_update = []
        print article_list
        for list__ in article_list:
            print list__[3]
            result = parse_html(list__[3])
            if isinstance(result, dict):
                list__[-3] = result['reads']
                list__[-2] = result['likes']
                print 'success'
            else:
                print 'error'
                print result
            list_update.append(list__)
            time.sleep(30)
        print list_update
        for a in list_update:
            article = Article.Article()
            article.id = a[0]
            article.article_name = a[1]
            article.article_title = a[2]
            article.article_url = a[3]
            article.read = a[4]
            article.like = a[5]
            article.last_edit_time = a[6]
            articleDao.modifyArticle(article)
        print "update success"
    except Exception as e:
        print e.message

def getNewEntity(entityName):
    obj = None
    if (entityName == 'articles'):
        obj = Article()
    if (entityName == 'customers'):
        obj = Customer()
    if (entityName == 'suppliers'):
        obj = Supplier()
    #if (entityName == 'purchases'):
    #    obj = Article()
    #if (entityName == 'sales'):
    #    obj = Article()
    return obj

def test_if_learn_quit():
    clear_all()
    with open('./articles/test.txt', 'w') as t:
        t.write('a b c d')
    CONFIG = safe_IO.load_json('./FAIDK.config')
    article = Article.Article(CONFIG, 'test.txt', '1')
    with open('old.txt') as o:
        assert o.read() == '\na'
    with open('new.txt') as n:
        assert n.read() == 'b'
    with open('./articles/l_test.txt') as n:
        assert n.read() == 'c\nd'
    clear_all()

def streamNews(language):
    streamedNews = []
    for sourceName in language.getSources():
        link = 'http://newsapi.org/v2/top-headlines?sources={0}&apiKey=c351e7c333d74c3ca1d882732176a67e'.format(sourceName)
        get = request.urlopen(link)  # Getting articles data
        content = json.loads(get.read().decode())
        for article in content['articles']:  # Creating articles list
            try:
                art = Article.Article(article, language.getStopWords(), False)
                streamedNews.append(art)
            except:
                pass
    return streamedNews

def genericScraper(self):
    config = newspaper.Config()
    config.browser_user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14'
    #config.request_timeout = 15
    # washingtonpost and usnews get funky when you set a user agent for some
    # reason (WaPo fails if the timeout isn't long, usnews throws a 403)
    if self.source not in ["washingtonpost", "usnews"]:
        a = newspaper.Article(self.url, config=config)
    else:
        a = newspaper.Article(self.url)
    try:  # make sure page download goes smoothly
        a.download()
        a.parse()
    except Exception as e:
        print("Rejected - DOWNLOAD ERROR: ", e)
        return None
    text = cleanText(a.text)
    # not much article text - full article is likely not picked up, and worst
    # case scenario a short article is rejected (probably not all that useful
    # in the long run)
    if len(text) < 500:
        print("Rejected - Article text was less than 500 characters, likely bad scraping job")
        return None
    # get title, author, date and images as necessary
    if not self.title:
        if a.title:
            self.title = a.title
    if not self.author:
        if a.authors:
            self.author = a.authors[0]
    if not self.date:
        if a.publish_date:
            self.date = a.publish_date.strftime("%Y-%m-%d")
    if not self.images:
        if a.top_image:
            self.images.append(a.top_image)
    article = Article(self.title, self.author, self.date, self.url,
                      self.source, text.strip(), self.images)
    return article

def ParseArticleList(Root):
    Path = '//div[@class="r-ent"]'
    ArticleNodes = Root.xpath(Path)
    ArticleList = []
    for ArticleNode in ArticleNodes:
        Title = ArticleNode.xpath('div[@class="title"]/a/text()')
        if IsNullNode(Title):
            continue
        Title = Title[0]
        if "本文已被刪除" in Title:  # skip posts marked "this article has been deleted"
            continue
        ArticleUrl = ArticleNode.xpath('div[@class="title"]/a')
        if IsNullNode(ArticleUrl):
            continue
        ArticleUrl = ArticleUrl[0].attrib['href']
        Push = ArticleNode.xpath('div[@class="nrec"]/span/text()')
        if IsNullNode(Push):
            Push = 0
        else:
            Push = Push[0]
        Mark = ArticleNode.xpath('div[@class="mark"]/text()')
        if IsNullNode(Mark):
            Mark = ""
        else:
            Mark = Mark[0]
        Author = ArticleNode.xpath('div[@class="meta"]/div[1]/text()')
        if IsNullNode(Author):
            continue
        Author = Author[0]
        PostDate = ArticleNode.xpath('div[@class="meta"]/div[2]/text()')
        if IsNullNode(PostDate):
            continue
        PostDate = PostDate[0]
        ArticleList.append(
            Article(Title=Title,
                    Meta=Meta(Author, PostDate),
                    Push=Push,
                    Mark=Mark,
                    ContentUrl=ArticleUrl))
    return ArticleList

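# One possible way to drive the parser above, assuming the board index HTML
# has already been fetched; lxml.html.fromstring() returns an element that
# supports the xpath() calls used in ParseArticleList.
import lxml.html

def ParseBoardIndex(HtmlText):
    Root = lxml.html.fromstring(HtmlText)
    return ParseArticleList(Root)
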
def search(self, kensu):
    print('Search start')
    at = Article.Article()
    for i in range(1, kensu + 1):
        # Switch to the next listing page here
        self.driver.get(self.url + '/latest/page/' + str(i))
        lists = self.driver.find_elements_by_class_name("list-group")
        for list in lists:
            hrefs = list.find_elements_by_tag_name("a")
            # Walk through hrefs and store the link held by each anchor tag
            for href in hrefs:
                self.link_list.append(href.get_attribute("href"))
        # Article has to be called here to fetch each URL's details,
        # game record (kifu) and evaluation information
        at.article(self.link_list)
        self.link_list = []
    self.driver.quit()

def parseReferences(reffile, downloadMSC=False):
    """parse references file (references.xml) and return references as a list of Article objects"""
    global attempts_all, attempts_success, attempts_failed
    reflist = []  # the result
    f = open(reffile, 'r')
    for line in f:
        line = line.strip()
        if line.startswith('<title>'):
            art = Article.Article()
            art.title = line.replace("<title>", "").replace("</title>", "")
            line = f.next().strip()
            while not line.startswith("</reference>"):
                if line.startswith("<link "):
                    type = line[line.find('source="') + len('source="'):]
                    type = type[:type.find('"')]
                    id = line[line.find('id="') + len('id="'):]
                    id = id[:id.find('"')]
                    if type == "zbl":
                        art.idZBL = id
                    elif type == "mref":
                        art.idMR = id
                    else:
                        # logging.debug("unknown source in %s: %s" % (reffile, type))
                        pass
                    line = f.next().strip()
                    if type in ['zbl', 'mref'] and downloadMSC:
                        url = line[1:line.rfind("</link>")]
                        msc = findmsc.findmsc(url)
                        attempts_all += 1
                        if not msc:
                            attempts_failed += 1
                            logging.warning(
                                "could not retrieve any MSC from url: %s" % url)
                        else:
                            attempts_success += 1
                            if art.msc == None:
                                art.msc = []
                            art.msc.extend(msc)
                            art.msc = list(set(art.msc))
                line = f.next().strip()
            reflist.append(art)  # add the article into result
    f.close()
    return reflist

def post_article(self, title, url, user_id, comm_id):
    article = Article(title, url, user_id, comm_id)
    article_id = article.get_id()
    self.id_to_article[article_id] = article
    self.article_ids.append(article_id)
    return article_id