def doc_nc_lines(self, bill_number):
    lines = []
    query_string = """
        SELECT `Art_Cod_Logico`, `DFA_Cantidad`, `DFA_PV_SinImp`, `DFA_Descuento`,
               `DFA_Monto_IV`, `DFA_Porc_IV`, `DFA_Precio_Venta`
        FROM DET_NDFact
        WHERE `NENC_ConsLogico` = ?
    """
    self.current_connection = AccessConnection()
    if self.current_connection.status:
        query_output, result = self.current_connection.run_query(
            query_string, (str(bill_number), ))
        if result:
            for counter, row in enumerate(query_output):
                current_article = Article(row[0])
                article_dictionary = current_article.get_article_data()
                lines.append({
                    'numero_linea': counter + 1,
                    'codigo': row[0],
                    'cantidad': row[1],
                    'detalle': article_dictionary.get("description"),
                    'precio': row[2],
                    'descuento': row[3],
                    'impuesto': row[4],
                    'porcentaje_impuesto': row[5],
                    'total': row[6],
                })
    return lines

def writeArticlesToFile(self):
    """
    Extracts and transforms each article from BeautifulSoup to an Article obj,
    then writes the article text to file
    """
    success = 0
    errors = 0
    base = "https://www.thedailystar.net/"
    for article in self.articles_raw:
        try:
            article_title = article.h5.text
            article_url = base + article.a['href']
            response = scraper.makeSpoofedRequest(article_url)
            article_soup = BeautifulSoup(response.text, 'html.parser')
            articleObj = Article(self.section_title, article_title, article_soup, self.date)
            articleObj.writeToFile()
            success += 1
        except Exception as e:
            # article_title may not be bound yet if the failure happened while
            # reading it, so report the exception itself
            print(f"Error writing article: {e}")
            errors += 1
            continue
    print(f"Wrote {success} articles to file in {self.section_title}")
    print(f"Could not write {errors} articles.")
    self._log(success, errors)

def GetReferenceList(seedArticle, databaseFile=None, graphFile=None):
    global global_identification_value
    seedArticle.link = seedArticle.link.replace('articleDetails', 'abstractReferences')
    html = GetHTMLFromLink(driver, seedArticle.link)
    references = GetReferencesFromHTML(html)
    articleList = []
    for ref in references:
        try:
            article = Article()
            article.title = GetTitleFromRef(ref)
            html = GetHTMLSearchIEEEByName(webdriver.Firefox(), article.title)
            article.link = GetSearchLinkFromArticleName(html, article.title)
            article.identification = parseIdentificationFromLink(article.link)
            articleList.append(article)
            # if (databaseFile is not None and graphFile is not None):
            #     AppendDatabaseFromMap([article], databaseFile, graphFile)
            print(article.identification)
            print(article.title)
            print("\n")
        except Exception:
            continue
    return articleList

def read_articles(fn='Text/nhk_easy.txt', if_article=True, if_para=True, if_sentence=True):
    f = open(fn)
    articles = {}
    line_match = re.compile(r'(k\d{14})\s{4}(.*)\n')
    for line in f:
        match = line_match.match(line)
        if match:
            news_id = match.group(1)
            text = match.group(2)
            if if_article:
                articles[news_id] = Article(news_id, text)
            if not if_para:
                continue
            paras = re.split(' ', text)
            for pid in range(len(paras)):
                news_para_id = news_id + '_para' + str(pid + 1)
                if len(paras[pid].strip()) > 0:
                    articles[news_para_id] = Article(news_para_id, paras[pid].strip())
                    # print news_para_id, paras[pid]
                if not if_sentence:
                    continue
                sentences = re.split('。', paras[pid].strip())
                for sid in range(len(sentences)):
                    news_para_sentence_id = news_para_id + '_s' + str(sid + 1)
                    if len(sentences[sid].strip()) > 0:
                        articles[news_para_sentence_id] = Article(
                            news_para_sentence_id, sentences[sid].strip() + '。')
                        # print news_para_sentence_id, sentences[sid].strip()
    f.close()
    return articles

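# The regex above expects one document per line of the assumed Text/nhk_easy.txt
# layout: an id of the form 'k' plus 14 digits, four spaces, then the article
# text (paragraphs separated by a space, sentences ending in '。'). A made-up
# line matching that layout, as a quick sanity check:
import re

sample_line = 'k10011234567890    きょうのニュースです。 あしたは晴れです。\n'
assert re.compile(r'(k\d{14})\s{4}(.*)\n').match(sample_line) is not None
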
def update_index(db):
    print('updating main page...')
    payload = {'country': 'US', 'apiKey': 'eb4ad8625c5b4f57bb62f8c95601038a'}
    r = requests.get('https://newsapi.org/v2/top-headlines', params=payload)
    raw_json = r.json()
    index_articles = db["articles"]
    index_articles.delete_many({'is_index': 1})
    for item in raw_json['articles']:
        try:
            article = Article(item['url'])
            article.build()
            index_articles.insert_one({
                'source': article.source_url,
                'title': article.title,
                'url': article.url,
                'topImage': article.topImage,
                'text': article.text,
                'keywords': article.keywords,
                'tags': article.tags,
                'category': article.category,
                'time': article.time,
                'is_index': 1
            })
        except:
            print('pass this article.')
    print('update finished!')

def preprocess_article(article: Article):
    ner_types = ["PERSON", "NORP", "ORG", "GPE", "LOC"]
    title = article.title
    title = re.sub("(?i)COVID-19", "coronavirus", title)
    title = re.sub("(?i)COVID19", "coronavirus", title)
    title = re.sub("(?i)COVID", "coronavirus", title)
    title = title.split()
    article.title_clean = " ".join([w for w in title if w not in STOP_WORDS])
    article.nlp_title_clean = nlp(article.title_clean)
    article.title_clean_lemmatized = lemmatizer(article.nlp_title_clean)
    text = article.text
    text = re.sub("(?i)COVID-19", "coronavirus", text)
    text = re.sub("(?i)COVID19", "coronavirus", text)
    text = re.sub("(?i)COVID", "coronavirus", text)
    text = text.split()
    article.text_clean = " ".join([w for w in text if w not in STOP_WORDS])
    # article.nlp_text_clean = nlp(article.text_clean)
    # article.text_clean_lemmatized = lemmatizer(article.nlp_text_clean)
    # article.title_text_named_entities = (
    #     [ent.text for ent in article.nlp_title_clean.ents if ent.label_ in ner_types]
    #     + [ent.text for ent in article.nlp_text_clean.ents if ent.label_ in ner_types])
    return article

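# preprocess_article relies on module-level names (nlp, STOP_WORDS, lemmatizer)
# that are not defined here. A minimal sketch of what they might look like,
# assuming spaCy is the NLP library in use; the model name and the lemmatizer
# helper are assumptions, not taken from the original code:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load("en_core_web_sm")

def lemmatizer(doc):
    # join the lemma of every token in a spaCy Doc back into a string
    return " ".join(token.lemma_ for token in doc)
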
def LoadMapFromDatabase(databaseFileName, graphFileName):
    articleMap = {}
    # load database row by row, creating Article objects into the map
    with open(databaseFileName, 'r') as database:
        databasereader = csv.reader(database, delimiter=',', quotechar='|',
                                    quoting=csv.QUOTE_MINIMAL)
        for row in databasereader:
            if row is None or len(row) < 3 or len(row[0]) == 0:
                continue
            article = Article()
            article.identification = int(row[0])
            article.title = row[1]
            article.link = row[2]
            articleMap[article.identification] = article
    # load references
    with open(graphFileName, 'r') as graph:
        graphreader = csv.reader(graph, delimiter=',', quotechar='|',
                                 quoting=csv.QUOTE_MINIMAL)
        for row in graphreader:
            if len(row) < 2:
                continue
            articleMap[int(row[0])].references = []
            for x in row[1:]:
                if len(x) > 0 and int(x) in articleMap.keys():
                    articleMap[int(row[0])].references.append(articleMap[int(x)])
    # return map
    return articleMap

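# Inferred from the reader code above, the two CSV files are assumed to look
# roughly like this (the concrete ids, titles, and links are made up; the ids
# in graph.csv refer back to the first column of database.csv):
#
# database.csv -- id, title, link:
#   8844556,Some IEEE Paper Title,https://ieeexplore.ieee.org/document/8844556
#   1234567,A Cited Paper,https://ieeexplore.ieee.org/document/1234567
#
# graph.csv -- citing article id followed by cited ids:
#   8844556,1234567
#
# article_map = LoadMapFromDatabase("database.csv", "graph.csv")
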
def setUp(self):
    """called before the first test case of this unit begins"""
    self.article = Article(
        'www.cnn.com/2018/09/25/health/iyw-girl-named-florence-collects-donations-trnd/index.html'
    )
    self.invalid_article = Article('i am an invalid string')

def addPosting(self, file='', REQUEST=None, RESPONSE=None, index=1):
    """ add an article """
    id = self.createId()
    msg = Article(id)
    err, sage = msg.__of__(self)._validation(REQUEST, RESPONSE, 'delete attachment', file)
    if err:
        return err
    # Set thread number.
    msg.tnum = '1'
    self.ids.insert(id)
    self.data[id] = msg
    if index:
        msg.__of__(self).index()
    if RESPONSE:
        return self.showMessage(self, REQUEST=REQUEST,
                                title='Article Posted',
                                message='Your article has been posted',
                                action=self.absolute_url())
    return id

def parsingNews(self, url):
    ret = []
    links = self.parseLinkArtcle(url)
    for v in links:
        (title, text) = self.news.crawling(v)
        art = Article(self.getStringFilter(title), self.getStringFilter(text), "한국경제")
        print("title : ", title)
        print("text : ", text)
        ret.append(art.toDic())
    return ret

def get_article(self, title, url):
    '''
    Return an Article object after examining the article for company information
    '''
    try:
        company_url, company_name = self.parse_article(url)
        article = Article(title, url, company_name, company_url)
    except (AttributeError, TypeError) as e:
        article = Article(title, url)
    return article

def seg_to_article(segment):
    pattern1 = re.compile(r"<DOCNO>(.*?)</DOCNO>")
    doc_id = ''.join(pattern1.findall(segment))
    pattern2 = re.compile(r'<TEXT>(.*?)</TEXT>', re.M | re.S)
    doc_text = ''.join(pattern2.findall(segment))
    art = Article(doc_id, doc_text)
    if doc_id == '' or doc_text == '':
        print(art.to_string())
        print(segment)
    return art

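# The regexes above suggest a TREC-style document segment. A minimal, made-up
# example of the assumed input and a call to the parser (requires re and the
# Article class used above to be importable):
sample_segment = """<DOC>
<DOCNO>AP890101-0001</DOCNO>
<TEXT>
Example body text of the article.
</TEXT>
</DOC>"""
sample_article = seg_to_article(sample_segment)
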
def parsingNews(self, url):
    ret = []
    # NOTE: the original code queries self.url1 here, so the url argument is unused
    links = self.parseLinkArtcle(self.url1)
    for v in links:
        try:
            (title, text) = self.news.crawling(v)
            print('title : ', title)
            print("text : ", text)
            art = Article(self.util.getStringFilter(title),
                          self.util.getStringFilter(text), "매일경제")
            ret.append(art.toDic())
        except:
            print('new crawling error ')
    return ret

def convert_text_to_articles(fn='Text/nhk_easy.txt', if_article=True, if_para=True, if_sentence=True):
    old_articles = read_articles()
    f = open(fn)
    articles = {}
    line_match = re.compile(r'(k\d{14})\s{4}(.*)\n')
    for line in f:
        match = line_match.match(line)
        if match:
            news_id = match.group(1)
            text = match.group(2)
            if if_article:
                articles[news_id] = Article(news_id, text)
            if not if_para:
                continue
            paras = re.split(' ', text)
            for pid in range(1, len(paras)):
                news_para_id = news_id + '_para' + str(pid)
                if len(paras[pid].strip()) > 0:
                    articles[news_para_id] = Article(news_para_id, paras[pid].strip())
                    # print news_para_id, paras[pid]
                if not if_sentence:
                    continue
                sentences = re.split('。', paras[pid].strip())
                for sid in range(len(sentences)):
                    news_para_sentence_id = news_para_id + '_s' + str(sid + 1)
                    if len(sentences[sid].strip()) > 0:
                        articles[news_para_sentence_id] = Article(
                            news_para_sentence_id, sentences[sid].strip() + '。')
                        # print news_para_sentence_id, sentences[sid].strip()
    f.close()
    ##############################################
    # Keep old_articles, combine them into new one
    for doc_id in old_articles.keys():
        if doc_id not in articles:
            articles[doc_id] = old_articles[doc_id]
    ##############################################
    f = codecs.open('Text/nhk_easy_articles.txt', 'w', 'utf-8')
    for article in articles.values():
        f.write(json.dumps(article.__dict__) + '\n')
    f.close()

class ArticleTestCase(unittest.TestCase):

    def runTest(self):
        self.test_url()
        self.test_source_url()
        self.test_download_html()
        self.test_parse_html()

    def setUp(self):
        """called before the first test case of this unit begins"""
        self.article = Article(
            'www.cnn.com/2018/09/25/health/iyw-girl-named-florence-collects-donations-trnd/index.html'
        )

    def tearDown(self):
        """called after all test cases finish of this unit"""
        pass

    def test_url(self):
        assert self.article.url == '/2018/09/25/health/iyw-girl-named-florence-collects-donations-trnd/index.html'

    def test_source_url(self):
        assert self.article.source_url == 'http://www.cnn.com'
        request = requests.get(self.article.source_url + self.article.url)
        assert request.status_code == 200

    def test_download_html(self):
        self.article.download()
        assert len(self.article.html) > 5000

    def test_parse_html(self):
        """check whether parser function can use GooseObj correctly"""
        TOP_IMG = 'https://cdn.cnn.com/cnnnext/dam/assets/180925092633-03-iyw-wisniewski-trnds-large-169.jpg'
        TITLE = "4-year-old Florence didn't like sharing her name with a bad hurricane. So she did something about it."
        KEYWORDS = [
            'health',
            "4-year-old Florence didn't like sharing her name with a bad hurricane. So she did something about it. - CNN"
        ]
        AUTHOR = ['Christopher Dawson, CNN']
        self.article.download()
        self.article.parse()
        assert self.article.top_image == TOP_IMG
        assert self.article.title == TITLE
        assert self.article.keywords == KEYWORDS
        # assert self.article.author == AUTHOR

    def test_time(self):
        self.article.download()
        self.article.parse()
        assert self.article.time == 2.5

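# A minimal way to run the test case above, assuming the module is executed
# directly; unittest discovers the test_* methods on its own.
if __name__ == '__main__':
    unittest.main()
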
def parsingNews(self, url):
    ret = []
    links = self.parseLinkArtcle(url)
    for v in links:
        (title, text) = self.news.crawling(v)
        text = text.replace("저작권자 © 시사경제신문 무단전재 및 재배포 금지", "")
        art = Article(self.getStringFilter(title), self.getStringFilter(text), "시사경제")
        print("title : ", title)
        print("text : ", text)
        ret.append(art.toDic())
        print(art.toDic())
    return ret

def parse_xml(self, context):
    if 'Journal' in context:  # to check if url contains an XML with article
        abstract = context[context.index("<Abstract>") + 10:context.index("</Abstract>")]
        title = context[context.index("<ArticleTitle>") + 14:context.index("</ArticleTitle>")]
        return Article(abstract, title)
    else:
        return None

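# Slicing by character offsets is brittle if the tags carry attributes or
# appear more than once. A sketch of a more tolerant variant using the
# standard library's ElementTree; it assumes context is well-formed XML and
# that Article takes (abstract, title) as above:
import xml.etree.ElementTree as ET

def parse_xml_et(context):
    root = ET.fromstring(context)
    abstract = root.findtext(".//AbstractText")
    title = root.findtext(".//ArticleTitle")
    if abstract is None or title is None:
        return None
    return Article(abstract, title)
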
def parse_article(url):
    # given a url, get page content
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    data = urlopen(req).read()
    # parse as html structured document
    bs = BeautifulSoup(data, 'html.parser')
    # kill javascript content
    # print(bs.prettify())
    for s in bs.findAll('script'):
        s.replaceWith('')
    new_article = Article(bs)
    new_article.add_author()
    new_article.add_title()
    return new_article

def parse_article(self):
    title = self.title()
    author = self.author_name()
    url = self.article_url
    tag_list = self.tags()
    return Article(title, author, url, tag_list)

def process_articles():
    pages = 0
    # wiki_categories = obtain_categories(config_db.get("host"), config_db.get("db"),
    #                                     config_db.get("user"), config_db.get("passwd"),
    #                                     min=config_categories.get("articles_min"),
    #                                     max=config_categories.get("articles_max"))
    # file = open("./test_articles.txt", "w")
    with open(dataset, "r") as data:
        for line in data:
            elements = line.split(";")
            article_id = elements[0]
            article_title = elements[1]
            categories_part = elements[3][1:-2]
            categories_part = categories_part.replace("\'", "")
            article_categories = [
                category.strip() for category in categories_part.split(",")
            ]
            # article_categories = [c for c in elements[3] if c in wiki_categories]
            if article_categories:
                article = Article(id=article_id, categories=article_categories)
                process_article(article, article_title)
                # linea = str.format("{} ; {} ; {}\n", article_title,
                #                    str(clean_text(article_title)), str(article_categories))
                # file.write(linea)
                del article
            pages += 1
            if pages % 10000 == 0:
                print("Processed pages = ", pages)

def query_candidates(doc):
    minhash = MinHash(num_perm=128)
    keyword = doc.keyword.split(",")
    for k in keyword:
        time.sleep(2)
        # print(k)
        trans_text = translate_yandex(str(k), src="vi", dest="en").encode("utf-8")
        print(trans_text)
        minhash.update(trans_text)
    # result = forest.query(minhash, 3)
    result = lsh.query(minhash)
    result = ",".join(result)
    if not result:
        print(doc.title)
        print("----------------------------------------------")
        print("Not found")
        print("\n")
    else:
        docs = mydb.execute_query(
            "SELECT id, keyword, title FROM english WHERE id IN (" + result + ")")
        titles = [
            Article(id=item[0], keyword=item[1], title=item[2]) for item in docs
        ]
        print(doc.title)
        print("----------------------------------------------")
        for i in titles:
            print(i.title)
        print("\n")

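# The lsh object queried above is assumed to be a datasketch MinHashLSH index
# built elsewhere. A minimal sketch of how such an index could be constructed;
# the threshold, key values, and keyword source are assumptions:
from datasketch import MinHash, MinHashLSH

lsh = MinHashLSH(threshold=0.5, num_perm=128)

def index_document(doc_id, keywords):
    m = MinHash(num_perm=128)
    for k in keywords:
        m.update(k.encode("utf-8"))
    lsh.insert(doc_id, m)  # doc_id is what lsh.query() returns later
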
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--mode', '-m',
        help='work mode\n 1 for build mode \n 2 for find mode \n default None',
        default=None)
    parser.add_argument(
        '--config',
        help='use local config',
        dest='config',
        default=True,
        action='store_true')
    parser.add_argument('--no-config', help='not use local config',
                        dest='config', action='store_false')
    parser.add_argument('--file', '-f', help="not working")
    parser.add_argument('--floder', '-F', help='not working')
    args = parser.parse_args()
    CONFIG = safe_IO.load_json('./FAIDK.config')
    FLAG = safe_IO.check_flag(args.mode)
    if FLAG == 'q':
        logger.info('user exit')
        return
    safe_IO.check_output_file(CONFIG['MAIN_PATH'] + CONFIG['NEW_WORDS_PATH'])
    # NEM_WORDS_ALL = set()
    # if args.file is not None:
    #     FILE_NAMES = [args.file]
    #     logger.info('using args.file: ' + ','.join(FILE_NAMES))
    # else:
    FILE_NAMES = safe_IO.get_name(CONFIG['MAIN_PATH'] + CONFIG['ARTICLES_PATH'])
    safe_IO.try_make_dir(CONFIG['MAIN_PATH'] + CONFIG['OLD_ARTICLES_PATH'])
    for file in FILE_NAMES:
        # article = Article(CONFIG, file, FLAG)
        pass  # loop body truncated in the original snippet

def get_articles():
    articles = []
    links = []
    headers = []
    topics = []
    authors = []
    dates = []
    for i in range(0, 4032, 29):
        curr_url = BASE_URL + '/?start=' + str(i)
        response = urllib.request.urlopen(curr_url)
        html = response.read().decode('utf-8')
        soup = BeautifulSoup(html, 'lxml')
        headers, topics = get_headers_and_topics(soup, headers, topics)
        authors += get_authors(soup)
        dates += get_dates(soup)
    # modifier
    headers, links = make_headers_and_links(headers, links)
    for i, link in enumerate(links):
        try:
            articles.append(
                Article(link, headers[i], topics[i], authors[i], dates[i]))
        except:
            print('i:', i, 'link:', link, 'end of the link')
            print(headers[i])
            print(len(headers), len(topics), len(authors), len(dates))
            print(authors[i - 1])
            __import__('sys').stdout.flush()
    return articles

def extract_article(self, block):
    title = block.getText()
    if title:
        return Article(title)
    return None

def scrape_latest_updates(self):
    """ scrapes all articles from the 'Latest Updates' section """
    articles = []
    soup = bs(self.driver.page_source, 'html.parser')
    soup = soup.find("h2", {"id": "latest-updates"}).find_parent('div')
    elements = soup.find_all("li", {"class": "lx-stream__post-container"})
    for el_soup in elements:
        url = self.get_url(
            el_soup.find("a", {"class": "qa-story-cta-link"}))
        if url is None or url[0:6] != '/news/':
            continue
        title = self.get_text(
            el_soup.find("h3", {"class": "lx-stream-post__header-title"}))
        text = self.get_text(
            el_soup.find("p", {"class": "lx-stream-related-story--summary"}))
        date = self.get_date(
            el_soup.find("span", {"class": "qa-visually-hidden-meta"}))
        img = self.get_src(
            el_soup.find(
                "img", {"class": "lx-stream-related-story--index-image"}))
        # FILTER the articles to be scraped:
        # we only keep articles that have a URL (so the full content can be
        # scraped later) and that live in the news section of the website
        articles.append(Article(title, text, date, url, img))
    return articles

def parse_article(article):
    # Get article date
    date_div = article.find('div', class_='views-field-created-1')
    date_txt = date_div.find('div', class_='post-day').text + ' ' + date_div.find(
        'div', class_='post-month').text
    # Get title
    title_div = article.find('div', class_='views-field-title')
    title_txt = title_div.find('a').text
    # Get author
    author_div = article.find('div', class_='views-field-name')
    author_txt = author_div.find('span', class_='field-content').text
    # Get body
    body_div = article.find('div', class_='views-field-body')
    body_txt = body_div.find('p').text
    # Get link
    link_div = article.find('div', class_='views-field-view-node')
    link_txt = link_div.find('a').text
    link_href = link_div.find('a')['href']
    # Return Article object
    return Article(date_txt, title_txt, author_txt, body_txt, link_href)

def add_articles_to_current_clusters(API_URL, selected_ungrouped_article_id_list):
    news_groups = requests.get(
        f"{API_URL}/news?should_get_articles_and_id_only=true").json()
    news_groups_with_preprocessed_articles = []
    for ng in news_groups:
        news_group_with_preprocessed_articles = []
        for article_id in ng["articles"]:
            news_group_with_preprocessed_articles.append(
                preprocess_article(
                    Article(
                        requests.get(
                            f"{API_URL}/articles/{article_id}").json())))
        news_groups_with_preprocessed_articles.append(
            news_group_with_preprocessed_articles)
    ungrouped_article_ids = requests.get(
        f"{API_URL}/articles/?is_grouped=false&should_get_features_for_preprocessing=true"
    ).json()
    ungrouped_articles = [
        Article(a) for a in ungrouped_article_ids
        if a['_id'] in selected_ungrouped_article_id_list
    ]
    ungrouped_preprocessed_articles = [
        preprocess_article(a) for a in ungrouped_articles
    ]
    updated_news_group_ids = set()
    for i in range(len(news_groups_with_preprocessed_articles)):
        for x in ungrouped_preprocessed_articles:
            if x.is_grouped or is_news_not_belongs_to_group(
                    x, news_groups_with_preprocessed_articles[i]):
                continue
            print(f'Has same topic, {news_groups[i]["_id"]}, {x.id}')
            updated_news_group_ids.add(news_groups[i]["_id"])
            news_groups[i]["articles"].append(x.id)
            x.is_grouped = True
    print(f'updated_news_group_ids: {list(updated_news_group_ids)}')
    for ng in news_groups:
        ng_id = ng["_id"]
        if ng_id in updated_news_group_ids:
            requests.put(f"{API_URL}/news/{ng_id}", json={"articles": ng["articles"]})
    return list(updated_news_group_ids)

def get_articles(self):
    column_name = self.get_column_name()
    scroll_loader = ScrollLoader(
        "get",
        "http://zhuanlan.zhihu.com/api/columns/" + column_name + "/posts?limit=10",
        10)
    from Article import Article
    for response in scroll_loader.run():
        yield Article("http://zhuanlan.zhihu.com" + response)

def __init__(self, url, headers=None, request=None, soup=None):
    self.url = url
    self.headers = headers
    self.request = requests.get(self.url, headers=self.headers)
    self.soup = BeautifulSoup(self.request.content, 'html.parser')
    self.articles = [
        Article(article_html)
        for article_html in self.soup.find_all('article')
    ]

def StartFromSeed(seedLink, seedTitle):
    SeedArticle = Article()
    SeedArticle.link = seedLink
    SeedArticle.title = seedTitle
    SeedArticle.identification = parseIdentificationFromLink(SeedArticle.link)
    databasefile = "database.csv"
    graphfile = "graph.csv"
    SeedArticle.references = GetReferenceList(SeedArticle,
                                              databaseFile=databasefile,
                                              graphFile=graphfile)
    mapToInsert = {}
    mapToInsert[SeedArticle.identification] = SeedArticle
    for art in SeedArticle.references:
        mapToInsert[art.identification] = art
    AppendDatabaseFromMap(mapToInsert, databasefile, graphfile)
    print('done')

def insertIDF(table_store):
    articles = mydb.execute_query("SELECT id, content, word FROM " + table_store)
    list_articles = [Article(item[0], item[1], item[2]) for item in articles]
    contents = [item.content for item in list_articles]
    tf = TfidfVectorizer(use_idf=True)
    tf.fit_transform(contents)
    idf = tf.idf_
    mydb.insert("INSERT INTO idf (" + str(table_store) + ") VALUES (%s)", (idf,))

def counter(table, word=False):
    articles = mydb.execute_query("SELECT id, content, word, keyword FROM " + table)
    list_articles = [Article(item[0], item[1], item[2], item[3]) for item in articles]
    if word:
        contents = [item.word for item in list_articles]
    else:
        contents = [item.content for item in list_articles]
    cv = CountVectorizer(stop_words=get_stopwords(table))
    return cv, cv.fit_transform(contents)

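# A small usage sketch for counter(), assuming a table named "english" exists
# and that mydb/get_stopwords are configured as in the snippets above. The
# fitted CountVectorizer exposes the learned vocabulary, and the returned
# document-term matrix is a scipy sparse matrix.
cv, doc_term_matrix = counter("english", word=False)
print(len(cv.vocabulary_))    # number of distinct terms kept
print(doc_term_matrix.shape)  # (n_documents, n_terms)
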
def test_get_article_without_company_returns_article_obj(self):
    expected_article = Article(
        url=self.article_with_no_company,
        title=self.article_no_co_title
    )
    ret = self.parser.get_article(self.article_no_co_title,
                                  self.article_with_no_company)
    self.assertEqual(ret, expected_article)

def parseListHtml(page, titleindex):
    next_page = {'page': page, 'title': titleindex}
    common.save_now_page(next_page)
    mysql = Mysql()
    s = ''
    if page > 1:
        s = '_' + repr(page)
    print(url.format(titles[titleindex], s))
    try:
        response = requests.get(url.format(titles[titleindex], s),
                                headers=headers, timeout=10)
        response.encoding = 'gb2312'
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            re_coms = soup.find_all('ul', attrs={'class': 'recom_list'})
            articles = []
            for re_com in re_coms:
                article = Article(re_com.a.string,
                                  re_com.find('span', attrs={'class': 'gd1'}).a.attrs['href'])
                article.author = 'OK学车'
                article.contentHead = parseContentHead(
                    re_com.find('li', attrs={'class': 'recom_nr'}).text)
                article.type = types[titles[titleindex]]
                articles.append(article)
            parseArticle(articles)
            # save to the database
            mysql.insert_array(articles)
            mysql.close()
            # common.save_file(titles[titleIndex], 'page {0}'.format(page),
            #                  repr(common.convert_to_dicts(articles)))
            sleep_time = random.randint(5, 10)
            print('sleeping', sleep_time, 's before fetching the next page')
            time.sleep(sleep_time)
            parseListHtml(page + 1, titleindex)
        else:
            mysql.close()
            if titleindex + 1 < len(titles):
                parseListHtml(1, titleindex + 1)
    except Exception as e:
        print(traceback.format_exc())
        print('failed to fetch the page:', e)
        mysql.close()
        sleep_time = random.randint(1, 5)
        print(repr(sleep_time), 's until retry')
        time.sleep(sleep_time)
        parseListHtml(page + 1, titleindex)

def get_doc_lines(self, bill_number):
    lines = []
    query_string = """
        SELECT `Art_Cod_Logico`, `DFA_Cantidad`, `DFA_PV_SinImp`, `DFA_Descuento`,
               `DFA_Monto_IV`, `DFA_Porc_IV`, `DFA_Precio_Venta`, `DFA_Porc_Exoneracion`
        FROM DET_FACTURA
        WHERE `Fenc_Numero` = ?
    """
    self.current_connection = AccessConnection()
    if self.current_connection.status:
        query_output, result = self.current_connection.run_query(
            query_string, (str(bill_number), ))
        if result:
            for counter, row in enumerate(query_output):
                current_article = Article(row[0])
                article_dictionary = current_article.get_article_data()
                lines.append({
                    'numero_linea': counter + 1,
                    'codigo': row[0],
                    'cantidad': row[1],
                    'detalle': article_dictionary.get("description"),
                    'precio': row[2],
                    'descuento': row[3],
                    'impuesto': row[4],
                    'porcentaje_impuesto': row[5],
                    'total': row[6],
                    'porcentaje_exoneracion': row[7],
                    'unidad': article_dictionary.get("unit"),
                    'codigo_impuesto': article_dictionary.get("iva_code"),
                    'tarifa_impuesto': article_dictionary.get("iva_tarif"),
                    'cabys': article_dictionary.get('cabys')
                })
    return lines

def init():
    global driver, soup, page_type, page, website
    driver = webdriver.Chrome()
    soup = ""
    page_type = ""
    # NOTE: each assignment below overwrites the previous one, so page ends up
    # as a YoutubeVideo instance
    page = Page()
    page = Article()
    page = YoutubePage()
    page = YoutubeVideo()
    website = ""

def generateXML(self):
    """ Create the XML File """
    document = Document()
    home = document.createElement("SmartCruizerData")
    document.appendChild(home)
    for element in self.ArticleList.values():
        Article = document.createElement("Article")
        home.appendChild(Article)
        # Heading
        heading = document.createElement("Heading")
        Article.appendChild(heading)
        headingValue = document.createTextNode(element.getHeading())
        heading.appendChild(headingValue)
        # ShortText
        shortText = document.createElement("ShortText")
        Article.appendChild(shortText)
        shortTextValue = document.createTextNode(element.getShortText())
        shortText.appendChild(shortTextValue)
        # Thumbnail
        Thumbnail = document.createElement("Thumbnail")
        Article.appendChild(Thumbnail)
        ThumbnailValue = document.createTextNode(element.getThumbnail())
        Thumbnail.appendChild(ThumbnailValue)
        # Text
        text = document.createElement("Text")
        Article.appendChild(text)
        textValue = document.createTextNode(element.getText())
        text.appendChild(textValue)
    return document.toprettyxml(indent="", encoding="utf-8")

else:
    xapian_name = options.n
    # set PSQL database name
    database = options.d
    # set synonym path
    synonymPath = options.s  # Synonym file to use
    if not os.path.isfile(synonymPath):
        sys.exit("synonym file not existing - programme terminates")

if options.x:
    # import class Article from Article.py and connect to PostgreSQL database
    from Article import Article
    Article.getConnection(database)
    # select all articles in a range of years x >= b_year and x <= e_year
    articles = Article.getArticlesByYear(b_year, e_year)
    Article.closeConnection()
    print("\n-------------")
    print("processing files from year " + str(b_year) + " to " + str(e_year))
    print("-------------")
    print("got articles from PostgreSQL database")
    print("-------------")

# take the last year to create directory
indexer = PubMedXapian(xapian_name, xapianPath=options.xapian_database_path)

# build full text index with Xapian for all articles selected before
if options.x:
    print("now indexing articles in Xapian")
    indexer.buildIndexWithArticles(articles)
    print("\n-------------")

check_call(["mkdir", image_folder]) pages = list() page_count = 0 for pdf_page in PDFPage.create_pages(document): interpreter.process_page(pdf_page) layout = device.get_result() page = Page(layout, page_number=page_count+1, jpg=page_images[page_count]) page.find_segment_top_neighbors() pages.append( page ) page_count += 1 fp.close() pdfArticle = Article(pages, pdf_name) pdfArticle.find_default_fonts() pdfArticle.find_content_distances() pdfArticle.save_content(style="lines") pdfArticle.concatenate_segments() pdfArticle.identify_num_columns() pdfArticle.identify_sections() pdfArticle.save_images(image_folder) if xml_file != "": if label_mode == "A" or label_mode == "a": pdfArticle.assign_labels(xml_file) pdfArticle.print_label_accuracy() else: feature_vecs = XML_Parser.retrieve_tags(xml_file) feature_vecs.sort(key=lambda x:x[1])
def convertToArticle(self, article):
    """Takes the parsed Pubmed article and converts it to our lightweight format.

    :param article: A parsed Pubmed article
    :rtype: A properly formatted Article Object ( Stored Object )
    """
    articleObject = Article()
    # Loop over all the citation data
    medlineCitation = article["MedlineCitation"]
    for attr in medlineCitation:
        if attr == "PMID":
            articleObject.addAttribute("id", "pubmed", str(medlineCitation["PMID"]))
        if attr == "DateCreated":
            day = medlineCitation["DateCreated"]["Day"]
            month = medlineCitation["DateCreated"]["Month"]
            year = medlineCitation["DateCreated"]["Year"]
            dateCreated = self.convertDateToNative(day, month, year)
            articleObject.addAttribute("dateCreated", "pubmed", dateCreated)
        if attr == "DateCompleted":
            day = medlineCitation["DateCompleted"]["Day"]
            month = medlineCitation["DateCompleted"]["Month"]
            year = medlineCitation["DateCompleted"]["Year"]
            dateCompleted = self.convertDateToNative(day, month, year)
            articleObject.addAttribute("dateCompleted", "pubmed", dateCompleted)
        if attr == "Article":
            articleObject.addAttribute("title", "pubmed",
                                       medlineCitation["Article"]["ArticleTitle"].encode("utf8"))
            articleObject.addAttribute(
                "abstract", "pubmed",
                medlineCitation["Article"]["Abstract"]["AbstractText"][0].encode("utf8"))
            authors = []
            for author in medlineCitation["Article"]["AuthorList"]:
                authors.append("%s %s" % (author["ForeName"], author["LastName"]))
            articleObject.addAttribute("authors", "pubmed", authors)
            articleObject.addAttribute("source", "pubmed",
                                       str(medlineCitation["Article"]["Journal"]["Title"]))
    # Loop over all the aspects of the pubmed data
    for attr in article["PubmedData"]:
        if attr == "ArticleIdList":
            for id in article["PubmedData"]["ArticleIdList"]:
                articleObject.addAttribute("id", id.attributes["IdType"], str(id))
    return articleObject

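# A usage sketch, assuming the parsed Pubmed records come from Biopython's
# Entrez module. The e-mail address, the PMID, and the converter variable are
# placeholders, and the exact record layout can vary between Biopython versions.
from Bio import Entrez

Entrez.email = "you@example.com"
handle = Entrez.efetch(db="pubmed", id="23456789", retmode="xml")
records = Entrez.read(handle)
handle.close()

for pubmed_article in records["PubmedArticle"]:
    stored = converter.convertToArticle(pubmed_article)
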