예제 #1
0
def Insert_DB(fin):
    files = get_files_in_path(fin, 'csv')
    print(files)
    for file in files:
        print (ext_print("Processing file: " + file))

        # read input data
        rows = csv.reader(open(file, 'r', encoding='utf-8'))
        for row in rows:
            pubDate = row[2]
            if pubDate != '2014':  # To split tasks into differnt machines with MySQL, process tables separatly
                continue
            table = "article_" + pubDate

            # if (row[2] == '' or row[2] is None): PubDate = 0
            # PubDate = int(PubDate)
            # table = 'article_0-1950'
            # if 2000 >= PubDate >= 1951:
            #     table = 'article_1951-2000'
            # if 2005 >= PubDate >= 2001:
            #     table = 'article_2001-2005'
            # elif PubDate > 2005:
            #     table = 'article_'+ str(PubDate)
            
            param = (row[0], row[1], pubDate, row[3], row[4], row[5])
            sql = "INSERT INTO `" + table + "` (`PMID`, `JournalTitle`, `PubDate`, `ArticleTitle`, `Abstract`, `Keywords`) VALUES(%s, %s, %s, %s, %s, %s);"
            try:
                dbCur.execute(sql, param)
            except Exception as e:
                print("Error occurs when executing SQL: " + str(e))
            conn.commit()           
    print (ext_print('All tasks completed\n'))
    return True
예제 #2
0
def Insert_DB(fin):
    files = get_files_in_path(fin, 'csv')
    for file in files:
        print (ext_print("Processing file: " + file))
        sql = "LOAD DATA LOCAL INFILE '" + file + r"' INTO TABLE article_2014 FIELDS TERMINATED BY ',' OPTIONALLY ENCLOSED BY '\"' ESCAPED BY '\"' LINES TERMINATED BY '\r\n'"
        try:
            executeResult =  dbCur.execute(sql)
            print(executeResult)
        except Exception as e:
            print("Error occurs when executing SQL: " + str(e))
        conn.commit()
             
    print (ext_print('All tasks completed\n'))
    return True
예제 #3
0
def CT_extractxml(fin, fout=None):
    if fout is None:
        fout = fin
    processed_list = []  # set processed trials into here to avoid redundancy

    files = fileutils.get_files_in_path(fin, 'xml')
    for file in files:
        if file in processed_list:
            continue
        print(ext_print("Processing file: " + file))
        processed_list.append(file)
        if len(processed_list) % 1000 == 0:
            print(ext_print('Processing  %d' % len(processed_list)))

        # read input data
        text = open(file, 'r', encoding='utf-8').read()
        if text is not None:
            ct_xml = xml_parser.fromstring(text)
            blocks = ct_xml.findall('MedlineCitation')
            ct_xml = ""
            for block in blocks:
                tempContent = extract_component(block)
                pubDate = tempContent[2]
                if (pubDate is None or pubDate == ''):
                    year = 0
                else:
                    year = int(pubDate)

                table = 'article_0-1950'
                if 2000 >= year >= 1951:
                    table = 'article_1951-2000'
                elif 2005 >= year >= 2001:
                    table = 'article_2001-2005'
                elif year > 2005:
                    table = 'article_' + str(year)
                else:
                    table = 'article_unkown_year'

                outputFile = os.path.join(fout, table + ".csv")
                fileutils.write_csv_by_row(outputFile, tempContent)

    print(ext_print('All tasks completed\n'))
    return True
예제 #4
0
'''
Created on 2017-09-09

@author: lockon
'''

import time
from Pubmed.extract_utils.dbutils import get_DB_conn
from Pubmed.extract_utils.logutils import ext_print

#Find all the articles by year, the default year is 2014
def query_articles_by_year(year = '2014'):
    conn = get_DB_conn()
    cur = conn.cursor()
    executeResult = cur.execute("SELECT * FROM article_" + year)
    resultList = cur.fetchmany(executeResult)
    cur.close()
    conn.close() 
    return resultList

if __name__ == '__main__':
    print (ext_print('Start to Process'))
    start = time.clock()
    
    articles = query_articles_by_year('2014')
    for item in articles:
        print(item)

    end = time.clock()
    print(ext_print("Total cost time: " + str(end - start) + "s"))