def Insert_DB(fin):
    """Insert rows from every CSV file under *fin* into the per-year article table.

    Only rows whose PubDate column (row[2]) equals '2014' are inserted; other
    years are skipped so different machines can process different year tables.
    Commits once per file. Returns True when all files have been processed.

    NOTE(review): relies on module-level `dbCur` and `conn` — assumed to be an
    open DB-API cursor/connection pair; confirm against the importing module.
    """
    files = get_files_in_path(fin, 'csv')
    print(files)
    for file in files:
        print(ext_print("Processing file: " + file))
        # `with` guarantees the CSV handle is closed even if an insert fails
        # (the original leaked the file object).
        with open(file, 'r', encoding='utf-8') as fh:
            for row in csv.reader(fh):
                pubDate = row[2]
                # Split tasks across machines: this worker only handles 2014.
                if pubDate != '2014':
                    continue
                table = "article_" + pubDate
                param = (row[0], row[1], pubDate, row[3], row[4], row[5])
                # Values are parameterized; only the table name (restricted to
                # 'article_2014' by the filter above) is interpolated.
                sql = ("INSERT INTO `" + table + "` (`PMID`, `JournalTitle`, "
                       "`PubDate`, `ArticleTitle`, `Abstract`, `Keywords`) "
                       "VALUES(%s, %s, %s, %s, %s, %s);")
                try:
                    dbCur.execute(sql, param)
                except Exception as e:
                    # Best-effort: log and continue with the next row.
                    print("Error occurs when executing SQL: " + str(e))
        conn.commit()  # one commit per file keeps transactions bounded
    print(ext_print('All tasks completed\n'))
    return True
def Insert_DB(fin):
    """Bulk-load every CSV file under *fin* into `article_2014` via LOAD DATA.

    Uses MySQL's LOAD DATA LOCAL INFILE for speed instead of row-by-row
    inserts; commits after each file and returns True when done.

    NOTE(review): the file path is spliced into the SQL text unescaped — a
    path containing a single quote would break the statement; paths here are
    presumably locally generated, but verify.
    """
    for csv_path in get_files_in_path(fin, 'csv'):
        print(ext_print("Processing file: " + csv_path))
        sql = (
            "LOAD DATA LOCAL INFILE '"
            + csv_path
            + r"' INTO TABLE article_2014 FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"' ESCAPED BY '\"' LINES TERMINATED BY '\r\n'"
        )
        try:
            affected = dbCur.execute(sql)
            print(affected)
        except Exception as e:
            print("Error occurs when executing SQL: " + str(e))
        conn.commit()
    print(ext_print('All tasks completed\n'))
    return True
def CT_extractxml(fin, fout=None): if fout is None: fout = fin processed_list = [] # set processed trials into here to avoid redundancy files = fileutils.get_files_in_path(fin, 'xml') for file in files: if file in processed_list: continue print(ext_print("Processing file: " + file)) processed_list.append(file) if len(processed_list) % 1000 == 0: print(ext_print('Processing %d' % len(processed_list))) # read input data text = open(file, 'r', encoding='utf-8').read() if text is not None: ct_xml = xml_parser.fromstring(text) blocks = ct_xml.findall('MedlineCitation') ct_xml = "" for block in blocks: tempContent = extract_component(block) pubDate = tempContent[2] if (pubDate is None or pubDate == ''): year = 0 else: year = int(pubDate) table = 'article_0-1950' if 2000 >= year >= 1951: table = 'article_1951-2000' elif 2005 >= year >= 2001: table = 'article_2001-2005' elif year > 2005: table = 'article_' + str(year) else: table = 'article_unkown_year' outputFile = os.path.join(fout, table + ".csv") fileutils.write_csv_by_row(outputFile, tempContent) print(ext_print('All tasks completed\n')) return True
'''
Created on 2017-09-09

@author: lockon
'''
import time

from Pubmed.extract_utils.dbutils import get_DB_conn
from Pubmed.extract_utils.logutils import ext_print


# Find all the articles by year; the default year is 2014.
def query_articles_by_year(year='2014'):
    """Return all rows of the per-year table `article_<year>` as a sequence.

    Opens a fresh connection per call and always closes cursor and
    connection, even when the query raises (the original leaked both on
    error). Uses fetchall() instead of fetchmany(execute()'s return value):
    PEP 249 leaves cursor.execute()'s return value unspecified, so it cannot
    be trusted as a row count.

    NOTE(review): `year` is interpolated into the table name unescaped —
    callers must pass trusted values (here it is a hard-coded literal).
    """
    conn = get_DB_conn()
    cur = conn.cursor()
    try:
        cur.execute("SELECT * FROM article_" + year)
        resultList = cur.fetchall()
    finally:
        cur.close()
        conn.close()
    return resultList


if __name__ == '__main__':
    print(ext_print('Start to Process'))
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for wall-clock interval timing.
    start = time.perf_counter()
    articles = query_articles_by_year('2014')
    for item in articles:
        print(item)
    end = time.perf_counter()
    print(ext_print("Total cost time: " + str(end - start) + "s"))