def main():
    # get the path for the input file argument
    parser = argparse.ArgumentParser()
    parser._action_groups.pop()
    required = parser.add_argument_group('required arguments')
    optional = parser.add_argument_group('optional arguments')
    required.add_argument('-tr', '--train_image', help='image used for training the algorithm', required=True)
    required.add_argument('-te', '--test_image', help='image to evaluate', required=True)
    optional.add_argument('-l', '--log', dest='logLevel',
                          choices=['DEBUG', 'debug', 'INFO', 'info', 'ERROR', 'error'],
                          help='Argument used to set the logging level')
    optional.add_argument('-knn', '--knn', help='flag to run knn', action='store_true')
    args = parser.parse_args()

    logger_initialization(log_level=args.logLevel)
    logging.getLogger('regular.time').info('starting running handwritten-notes script')

    digits, y_train = load_digits(args.train_image)
    x_train = pixels_to_hog_20(digits)
    num_pixels = x_train.shape[1]
    num_classes = len(np.unique(y_train))

    if args.knn:
        logging.getLogger('regular.time').info('training knn model')
        model = KNeighborsClassifier()
        model.fit(x_train, y_train)
    else:
        logging.getLogger('regular.time').info('training NN model')
        model = Sequential()
        model.add(Dense(num_pixels, input_dim=num_pixels, kernel_initializer='normal', activation='relu'))
        model.add(Dense(num_classes, kernel_initializer='normal', activation='softmax'))
        # Compile model
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    process_test_image(dataset=args.test_image, model=model, model_type=args.knn)
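
# NOTE: pixels_to_hog_20 is called above but not defined in this file. The sketch below is a
# minimal, assumed implementation (not the original) that computes a HOG feature vector for
# 20x20 grayscale digit crops with OpenCV; the descriptor parameters are illustrative guesses.
import cv2
import numpy as np


def pixels_to_hog_20(digits):
    """Assumed sketch: compute HOG features for a list/array of 20x20 digit images."""
    # winSize, blockSize, blockStride, cellSize, nbins -- illustrative values only
    hog = cv2.HOGDescriptor((20, 20), (10, 10), (5, 5), (10, 10), 9)
    features = [hog.compute(img.astype(np.uint8)).flatten() for img in digits]
    return np.array(features, dtype=np.float32)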
def main():
    # get the path for the input file argument
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_file', help='dataset file that has not been processed')
    parser.add_argument('-tr', '--train_file', help='processed training dataset file')
    parser.add_argument('-te', '--test_file', help='processed testing dataset file')
    parser.add_argument("-l", "--log", dest="logLevel", choices=['DEBUG', 'INFO', 'ERROR'], type=str.upper,
                        help="Set the logging level")
    args = parser.parse_args()

    logger_initialization(log_level=args.logLevel)

    training_dir = args.train_file
    testing_dir = args.test_file
    # if those variables were not passed, populate them with the defaults
    if not (training_dir or testing_dir):
        training_dir = 'datasets/train_data_processed.csv'
        testing_dir = 'datasets/test_data_processed.csv'
    else:
        training_dir = 'datasets/' + training_dir + '.csv'
        testing_dir = 'datasets/' + testing_dir + '.csv'

    load(train_directory=training_dir, test_directory=testing_dir)

    logging.getLogger('regular.time').info('starting running pre-processing script')

    # import data from file
    dataset = load_data(args.input_file)
    # calculate relevant variables' values
    process_dataset(dataset=dataset, train_dir=training_dir, test_dir=testing_dir)
    # save it
    store_dataset()

    logging.getLogger('regular.time').info('finished running pre-processing script')
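
# NOTE: logger_initialization is used by all of these scripts but its definition is not shown.
# A minimal, assumed version that simply maps the command-line level onto the root logger so
# that named loggers such as 'regular' and 'regular.time' inherit it; the real helper may also
# add file handlers and custom formats.
import logging


def logger_initialization(log_level=None, **kwargs):
    """Assumed sketch: configure logging from the -l/--log command-line value."""
    level = getattr(logging, (log_level or 'INFO').upper(), logging.INFO)
    logging.basicConfig(level=level, format='%(asctime)s %(name)s %(levelname)s: %(message)s')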
def main(): """ starts running the script :return: None. """ # get the the path for the input file argument parser = argparse.ArgumentParser() parser.add_argument('-r', '--retrieve', help='arg use to pull data from PubMed', action='store_true') parser.add_argument( '-p', '--process', help='arg use to process the info into paper, author, medical and ' 'title_abstracts records', action='store_true') parser.add_argument('-a', '--analyze', help='run topic modeling on the file', action='store_true') parser.add_argument( '-f', '--file', help= 'file to process. Depending on whether the retrieve, process or analysis ' 'options were selected, there is a different default file') parser.add_argument('-l', '--log', dest='logLevel', choices=['DEBUG', 'INFO', 'ERROR'], type=str.upper, help='Set the logging level') if sys.platform == "darwin" or sys.platform == "win32": if sys.platform == "win32": path = 'D:\dataset\scosy\dataset' else: path = '/Volumes/dataset/scosy/dataset' # Respublica else: path = 'dataset/' args = parser.parse_args() logger_initialization(log_level=args.logLevel) logging.getLogger('line.regular.time.line').info('Running SCOSY') if args.retrieve: logging.getLogger('regular').info('retrieving data from PudMed') # databases such as PubMed, GenBank, GEO, and many others # Use the mandatory email parameter so the NCBI can contact you if there is a proble Entrez.email = "*****@*****.**" # Always tell NCBI who you are logging.getLogger('regular').info( 'searching PubMed for CHOP and UPENN authors') handle = Entrez.esearch( db="pubmed", retmax=100000000, idtype="esearch", mindate="2014/01/01", maxdate="2020/08/21", term= "Perelman School of Medicine[Affiliation] OR Children's Hospital of " "Philadelphia[Affiliation] OR University of Pennsylvania School of " "Medicine[Affiliation] OR School of Medicine University of " "Pennsylvania[Affiliation]", usehistory="y") search_results = Entrez.read(handle) handle.close() # obtaining the list of relevant PMIDs id_list = search_results["IdList"] # get all the record based on the PMIDs # logging.getLogger('regular.time').info('getting relevant authors\' records based on PMIDs') fetch_records_handle = Entrez.efetch(db="pubmed", id=id_list, retmode="text", rettype="medline") # need to read all the data from the handle and store in a file because if we just read line by line from the # generator and the internet connection is not strong, then we run into http errors: # http.client.IncompleteRead: IncompleteRead(0 bytes read) result_path = Path(path, 'results.txt') out_handle = result_path.open('w+') out_handle.write(fetch_records_handle.read()) # the results are now in the results.xml file and the original handle has had all of its data extracted # (so we close it) out_handle.close() msg = 'saved authors\' records on local file = {0}'.format(result_path) logging.getLogger('regular.time').info(msg) elif args.process: # import data from file logging.getLogger('regular').info('reading data from result file') file_name = args.file if not file_name: file_name = 'results.txt' result_path = Path(path, file_name) records_handle = result_path.open() fetch_records = parse(handle=records_handle) # initializing variables mesh_description_dict = obtain_descriptions() # contains all the metadata elements on the author level: PubMed unique Identifier number(PMID), AuthorID (as a # (CA) Ordinary Author (OA) or Principal Author (PA) and the author's affiliation author_record_df = pd.DataFrame(columns=[ 'PMID', 'Author', 'author_chop', 'author_penn', 'Role', 
'AffiliationInfo' ]) # contains all the metadata elements on the paper level: PubMed unique Identifier number(PMID), Title, Abstract, # Year, Month, AuthorList, SubjectList, date paper_record_df = pd.DataFrame(columns=[ 'PMID', 'Title', 'Abstract', 'Year', 'Month', 'author_list', 'subject_list', 'date' ]) # contains all the metadata of the medical information: PubMed unique Identifier number(PMID), Primary Medical # Subject Header (MESH) and the description ID medical_record_df = pd.DataFrame( columns=['PMID', 'Desc', 'Primary_MeSH']) title_list = list() abstract_list = list() # get the relevant information for each record for record_index, record in enumerate(fetch_records): logging.getLogger('regular').debug( 'record index = {0}'.format(record_index)) try: pmid = record.get('PMID') title = record.get('TI') abstract = record.get('AB') authors = record.get('FAU') affiliations = record.get('AD') publication_type = record.get('PT') mesh_term = record.get('MH') date_created = record.get('EDAT') year, month = date_created.split('/')[:2] date = year + '/' + month logging.getLogger('regular').debug('pmid = {0}'.format(pmid)) logging.getLogger('regular').debug('title = {0}'.format(title)) logging.getLogger('regular').debug( 'abstract = {0}'.format(abstract)) logging.getLogger('regular').debug( 'authors = {0}'.format(authors)) logging.getLogger('regular').debug( 'affiliations = {0}'.format(affiliations)) logging.getLogger('regular').debug( 'publication type = {0}'.format(publication_type)) logging.getLogger('regular').debug( 'mesh term = {0}'.format(mesh_term)) logging.getLogger('regular').debug( 'data created = {0}'.format(date_created)) # assign the chief author, ordinary author or principal investigator role to each author roles = assign_roles(authors) # check and assign whether the authors belong to the CHOP or PENN organization chop_organization, penn_organization = assign_organization( affiliations) mesh_description = '' if mesh_term is None: mesh_term = '' else: mesh_description, term = convert_mesh_description( mesh_description_dict, mesh_term) mesh_term = ';'.join(mesh_term) # output information if mesh_description: row = pd.DataFrame( [[pmid, term, mesh_description]], columns=['PMID', 'Primary_MeSH', 'Desc']) medical_record_df = medical_record_df.append( row, ignore_index=True) for author_index, organizations in enumerate( zip(chop_organization, penn_organization)): # check if the author belongs to either CHOP or PENN if 1 in organizations: row = pd.DataFrame([[ pmid, authors[author_index], organizations[0], organizations[1], roles[author_index], affiliations[author_index] ]], columns=[ 'PMID', 'Author', 'author_chop', 'author_penn', 'Role', 'AffiliationInfo' ]) author_record_df = author_record_df.append( row, ignore_index=True) authors = ';'.join(authors) row = pd.DataFrame([[ pmid, title, abstract, year, month, authors, mesh_term, date ]], columns=[ 'PMID', 'Title', 'Abstract', 'Year', 'Month', 'author_list', 'subject_list', 'date' ]) paper_record_df = paper_record_df.append(row) title_list.append(title) abstract_list.append(abstract) except Exception as e: msg = 'Error while processing PMID={0}'.format(pmid) logging.getLogger('regular').debug(msg) msg = 'Exception message = {0}'.format(e) logging.getLogger('regular').debug(msg) # contains all the metadata elements on the author level: Pubmed unique Identifier number(PMID), AuthorID (as a # (CA) Ordinary Author (OA) or Principal Author (PA) and the author's affiliation author_path = Path(path, 'author_record.csv') 
author_record_df.to_csv(author_path, index=False) # contains all the metadata elements on the paper level: Pubmed unique Identifier number(PMID), Title, Abstract, # Year, Month, AuthorList, SubjectList, date paper_path = Path(path, 'paper_record.csv') paper_record_df.to_csv(paper_path, index=False) # contains all the metadata of the medical information: Pubmed unique Identifier number(PMID), Primary Medical # Subject Header (MESH) and the description ID medical_path = Path(path, 'medical_record.csv') medical_record_df.to_csv(medical_path, index=False) # store the record in a file for processing dataset = dict() dataset['title'] = title_list dataset['abstracts'] = abstract_list dataset['mesh'] = mesh_term dataset = pd.DataFrame(dataset) titles_abstracts_mesh_path = Path(path, 'titles_abstracts_mesh.csv') dataset.to_csv(path_or_buf=titles_abstracts_mesh_path, index=False) logging.getLogger('line.regular.time.line').info( 'SCOSY finished running successfully.')
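
# NOTE: assign_roles is called above but defined elsewhere. Based on the role logic that appears
# later in this collection (first two authors = Chief Author, last author = Principal
# Investigator, everyone else = Ordinary Author), an assumed sketch could look like this.
def assign_roles(authors):
    """Assumed sketch: return a role code ('CA', 'OA' or 'PI') for each author."""
    roles = []
    for index in range(len(authors)):
        if index <= 1:
            roles.append('CA')   # first two authors: Chief Author
        elif index == len(authors) - 1:
            roles.append('PI')   # last author: Principal Investigator
        else:
            roles.append('OA')   # everyone in between: Ordinary Author
    return roles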
def main():
    # get the path for the input file argument
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log", dest="logLevel", choices=['DEBUG', 'INFO', 'ERROR'], type=str.upper,
                        help="Set the logging level")
    args = parser.parse_args()

    logger_initialization(log_level=args.logLevel)
    logging.getLogger('line.regular.time.line').info('Running Recommendation System script')

    # import data from file
    logging.getLogger('regular').info('reading data from file')

    # Entrez is a data retrieval system that provides users access to NCBI's databases such as PubMed, GenBank, GEO,
    # and many others
    # Use the mandatory email parameter so the NCBI can contact you if there is a problem
    Entrez.email = "*****@*****.**"  # Always tell NCBI who you are

    logging.getLogger('regular').info('searching PubMed for the CHOP and UPENN authors')
    handle = Entrez.esearch(db="pubmed", retmax=50000, idtype="esearch", mindate="2014/01/01", maxdate="2017/05/01",
                            term="Perelman School of Medicine[Affiliation] OR Children's Hospital of "
                                 "Philadelphia[Affiliation] OR University of Pennsylvania School of "
                                 "Medicine[Affiliation] OR School of Medicine University of "
                                 "Pennsylvania[Affiliation]",
                            usehistory="y")
    search_results = Entrez.read(handle)
    handle.close()

    # obtaining the list of relevant PMIDs
    id_list = search_results["IdList"]

    # get all the records based on the PMIDs
    logging.getLogger('regular').info('getting relevant authors\' records based on PMIDs')
    fetch_records_handle = Entrez.efetch(db="pubmed", id=id_list, retmode="text", rettype="medline")
    # need to read all the data from the handle and store it in a file because if we just read line by line from the
    # generator and the internet connection is not strong, then we run into http errors:
    # http.client.IncompleteRead: IncompleteRead(0 bytes read)
    logging.getLogger('regular').info('storing authors\' records on local file')
    with open("results.xml", "w") as out_handle:
        out_handle.write(fetch_records_handle.read())
    # the results are now in the results.xml file and the original handle has had all of its data extracted
    # (so we close it)
    fetch_records_handle.close()

    logging.getLogger('regular').info('reading result files')
    records_handle = open("results.xml")
    fetch_records = parse(handle=records_handle)

    # initializing variables
    mesh_description_dict = obtain_descriptions()

    # contains all the metadata elements on the author level: PubMed unique Identifier number (PMID), AuthorID,
    # institution (CHOP/PENN), Role: Chief Author (CA), Ordinary Author (OA) or Principal Author (PA) and the
    # author's affiliation
    author_record_df = pd.DataFrame(columns=['PMID', 'AuthorID', 'Author CHOP', 'Author PENN', 'ROLE', 'Affiliation'])
    # contains all the metadata elements on the paper level: PubMed unique Identifier number (PMID), Title, Abstract,
    # Year, Month, AuthorList, SubjectList, date
    paper_record_df = pd.DataFrame(columns=['PMID', 'Title', 'Abstract', 'Year', 'Month', 'Author List',
                                            'Subject List', 'Date'])
    # contains all the metadata of the medical information: PubMed unique Identifier number (PMID), Primary Medical
    # Subject Heading (MeSH) and its description
    medical_record_df = pd.DataFrame(columns=['PMID', 'MESH', 'Description'])

    title_list = list()
    abstract_list = list()

    # get the relevant information for each record
    for record_index, record in enumerate(fetch_records):
        logging.getLogger('regular').debug('record index = {0}'.format(record_index))
        try:
            pmid = record.get('PMID')
            title = record.get('TI')
            abstract = record.get('AB')
            authors = record.get('FAU')
            affiliations = record.get('AD')
            publication_type = record.get('PT')
            mesh_term = record.get('MH')
            date_created = record.get('EDAT')
            year, month = date_created.split('/')[:2]
            date = year + '/' + month

            logging.getLogger('regular').debug('pmid = {0}'.format(pmid))
            logging.getLogger('regular').debug('title = {0}'.format(title))
            logging.getLogger('regular').debug('abstract = {0}'.format(abstract))
            logging.getLogger('regular').debug('authors = {0}'.format(authors))
            logging.getLogger('regular').debug('affiliations = {0}'.format(affiliations))
            logging.getLogger('regular').debug('publication type = {0}'.format(publication_type))
            logging.getLogger('regular').debug('mesh term = {0}'.format(mesh_term))
            logging.getLogger('regular').debug('date created = {0}'.format(date_created))

            # assign the chief author, ordinary author or principal investigator role to each author
            roles = assign_roles(authors)
            # check and assign whether the authors belong to the CHOP or PENN organization
            chop_organization, penn_organization = assign_organization(affiliations)

            mesh_description = ''
            if mesh_term is None:
                mesh_term = ''
            else:
                term, mesh_description = convert_mesh_description(mesh_description_dict, mesh_term)
                mesh_term = ';'.join(mesh_term)

            # output information
            if mesh_description:
                row = pd.DataFrame([[pmid, term, mesh_description]], columns=['PMID', 'MESH', 'Description'])
                medical_record_df = medical_record_df.append(row, ignore_index=True)

            for author_index, organizations in enumerate(zip(chop_organization, penn_organization)):
                if 1 in organizations:
                    row = pd.DataFrame([[pmid, authors[author_index], organizations[0], organizations[1],
                                         roles[author_index], affiliations[author_index]]],
                                       columns=['PMID', 'AuthorID', 'Author CHOP', 'Author PENN', 'ROLE',
                                                'Affiliation'])
                    author_record_df = author_record_df.append(row, ignore_index=True)

            authors = ';'.join(authors)
            row = pd.DataFrame([[pmid, title, abstract, year, month, authors, mesh_term, date]],
                               columns=['PMID', 'Title', 'Abstract', 'Year', 'Month', 'Author List', 'Subject List',
                                        'Date'])
            paper_record_df = paper_record_df.append(row)

            title_list.append(title)
            abstract_list.append(abstract)

        except Exception as e:
            msg = 'Error while processing PMID={0}'.format(pmid)
            logging.getLogger('regular').debug(msg)
            msg = 'Exception message = {0}'.format(e)
            logging.getLogger('regular').debug(msg)

    # store the records in a file for processing
    dataset = dict()
    dataset['title'] = title_list
    dataset['abstracts'] = abstract_list
    dataset = pd.DataFrame(dataset)
    dataset.to_csv(path_or_buf='record_results/titles_abstracts.csv', index=False)

    # read the records from the file
    # dataset = pd.read_csv('record_results/titles_abstracts.csv')
    # topic_modeling(dataset=dataset)

    pandas.io.formats.excel.header_style = None

    # author-level records
    author_record_df.to_excel('record_results/author_record.xlsx', sheet_name='author_record', index=False)
    # paper-level records
    paper_record_df.to_excel('record_results/paper_record.xlsx', sheet_name='paper_record', index=False)
    # medical (MeSH) records
    medical_record_df.to_excel('record_results/medical_record.xlsx', sheet_name='medical_record', index=False)

    logging.getLogger('line.regular.time.line').info('Recommendation System script finished running successfully.')
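
# NOTE: assign_organization is called above but defined elsewhere. It is used as two parallel
# lists of 0/1 flags (one per affiliation) for CHOP and PENN; the keyword matching below mirrors
# the affiliation checks that appear later in this collection and is an assumption, not the
# original implementation.
def assign_organization(affiliations):
    """Assumed sketch: flag, per affiliation, whether the author belongs to CHOP and/or PENN."""
    chop_flags, penn_flags = [], []
    for affiliation in affiliations:
        text = affiliation.lower()
        chop_flags.append(1 if 'children' in text else 0)
        penn_flags.append(1 if ('perelman' in text or 'school of medicine' in text or
                                'pennsylvania' in text) else 0)
    return chop_flags, penn_flags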
def main():
    # ignore warning of compiling tensorflow from source
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    # get the path for the input file argument
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_file', help='dataset file that has not been processed')
    parser.add_argument('-tr', '--train_file', help='processed training dataset file')
    parser.add_argument('-te', '--test_file', help='processed testing dataset file')
    parser.add_argument("-l", "--log", dest="logLevel", choices=['DEBUG', 'INFO', 'ERROR'], type=str.upper,
                        help="Set the logging level")
    parser.add_argument('-cv', '--cross_validation', action='store_true')
    parser.add_argument('-gs', '--grid_search', action='store_true')
    parser.add_argument('-svm', '--svm', help='run support vector machine', action='store_true')
    parser.add_argument('-p', '--processed_dataset', action='store_true',
                        help='this flag is used when the training and testing datasets are provided')
    parser.add_argument('-s', '--store_datasets', action='store_true',
                        help='this flag is used to store the training and testing dataset on the local system')
    args = parser.parse_args()

    logger_initialization(log_level=args.logLevel)
    logging.getLogger('line.regular.time.line').info('Running No_Show script')

    # import data from file
    logging.getLogger('regular').info('reading data from file')
    tr_data = pd.read_csv(filepath_or_buffer=args.train_file, delimiter='|')
    te_data = pd.read_csv(filepath_or_buffer=args.test_file, delimiter='|')

    logging.getLogger('regular').debug('training dataset shape = {0}'.format(tr_data.shape))
    logging.getLogger('regular').debug('training dataset keys = {0}'.format(tr_data.keys()))
    logging.getLogger('regular').debug('testing dataset shape = {0}'.format(te_data.shape))
    logging.getLogger('regular').debug('testing dataset keys = {0}'.format(te_data.keys()))

    y_train_data = tr_data['NOSHOW'].values
    y_test_data = te_data['NOSHOW'].values
    x_train_data = tr_data.drop(['PATIENT_KEY', 'ENCOUNTER_APPOINTMENT_DATETIME', 'ENCOUNTER_APPOINTMENT_STATUS',
                                 'NOSHOW'], axis=1).values
    x_test_data = te_data.drop(['PATIENT_KEY', 'ENCOUNTER_APPOINTMENT_DATETIME', 'ENCOUNTER_APPOINTMENT_STATUS',
                                'NOSHOW'], axis=1).values

    # check if the cross validation flag is set
    run_model(training_data=x_train_data, testing_data=x_test_data, training_y=y_train_data, testing_y=y_test_data,
              svm_flag=args.svm, gs_flag=args.grid_search)
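
# NOTE: run_model is called above but defined elsewhere; given the TensorFlow environment
# variable set at the top, the original probably trains a neural network when the SVM flag is
# off. The sketch below is only an assumed, simplified stand-in (SVM or logistic regression,
# optional grid search) to show how the flags could be consumed.
import logging
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


def run_model(training_data, testing_data, training_y, testing_y, svm_flag=False, gs_flag=False):
    """Assumed sketch: train the selected model and log its accuracy on the testing set."""
    model = SVC() if svm_flag else LogisticRegression(max_iter=1000)
    if gs_flag and svm_flag:
        # illustrative parameter grid only; the original script may tune different parameters
        model = GridSearchCV(model, param_grid={'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear']})
    model.fit(training_data, training_y)
    score = model.score(testing_data, testing_y)
    logging.getLogger('regular').info('model accuracy = {0}'.format(score))
    return score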
def main():
    # get the path for the input file argument
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log", dest="logLevel", choices=['DEBUG', 'INFO', 'ERROR'], type=str.upper,
                        help="Set the logging level")
    args = parser.parse_args()

    logger_initialization(log_level=args.logLevel)
    logging.getLogger('line.regular.time.line').info('Running Recommendation System script')

    # import data from file
    logging.getLogger('regular').info('reading data from file')

    # Entrez (http://www.ncbi.nlm.nih.gov/Entrez) is a data retrieval system that provides users access to NCBI's
    # databases such as PubMed, GenBank, GEO, and many others
    # Use the mandatory email parameter so the NCBI can contact you if there is a problem
    Entrez.email = "*****@*****.**"  # Always tell NCBI who you are

    # logging.getLogger('regular').info('searching PubMed for the CHOP and UPENN authors')
    # handle = Entrez.esearch(db="pubmed", retmax=50000, idtype="esearch", mindate="2014/01/01",
    #                         maxdate="2017/05/01",
    #                         term="Perelman School of Medicine[Affiliation] OR Children's Hospital of "
    #                              "Philadelphia[Affiliation] OR University of Pennsylvania School of "
    #                              "Medicine[Affiliation] OR School of Medicine University of "
    #                              "Pennsylvania[Affiliation]",
    #                         usehistory="y")
    # search_results = Entrez.read(handle)
    # handle.close()
    # # obtaining the list of relevant PMIDs
    # id_list = search_results["IdList"]
    #
    # # get all the records based on the PMIDs
    # logging.getLogger('regular').info('getting relevant authors\' records based on PMIDs')
    # fetch_records_handle = Entrez.efetch(db="pubmed", id=id_list, retmode="text", rettype="medline")
    # # need to read all the data from the handle and store it in a file because if we just read line by line from
    # # the generator and the internet connection is not strong, then we run into http errors:
    # # http.client.IncompleteRead: IncompleteRead(0 bytes read)
    # logging.getLogger('regular').info('storing authors\' records on local file')
    # with open("results.xml", "w") as out_handle:
    #     out_handle.write(fetch_records_handle.read(validate=True))
    # # the results are now in the results.xml file and the original handle has had all of its data extracted
    # # (so we close it)
    # fetch_records_handle.close()

    logging.getLogger('regular').info('reading result files')
    records_handle = open("results.xml")
    fetch_records = parse(records_handle)

    # initializing variables
    mesh_description_dict = obtain_descriptions()

    # PMID=PubMed Unique Identifier, TI=Title, AB=Abstract, AD=Affiliation, FAU=Full Author, MH=MeSH Terms,
    # PT=Publication Type
    # for more information, look at the abbreviations in the /template/abbreviations.txt file
    author_information = {'PMID': '', 'TI': '', 'AB': '', 'FAU': '', 'AU': '', 'MH': '', 'PT': '', 'AD': ''}
    author_list = list()
    affiliation_list = list()
    mesh_list = list()
    first_record = True

    # get the relevant information for each record
    for record_index, line in enumerate(fetch_records):
        logging.getLogger('regular').debug('line index = {0}'.format(record_index))

        # remove the newline delimiter
        line = line.replace('\n', '')
        # skip if empty string
        if not line:
            continue

        # getting the key (PMID, TITLE, ABSTRACT, etc.) and its value
        key, value = line.split('- ', 1)
        # remove spaces
        key = key.replace(' ', '')

        # check if key is relevant to the information of interest
        if key not in author_information.keys():
            continue

        if key == 'PMID':
            # if it is not the first record, then a new record has started and all the per-record variables need to
            # be reset
            if not first_record:
                author_information['AU'] = author_list
                author_information['AD'] = affiliation_list
                author_information['MH'] = mesh_list
                logging.getLogger('regular').debug('authors\' information = {0}'.format(author_information))
                # function that prints the author's information to the relevant files
                # output_author_information(author_information)
                author_information = {'PMID': '', 'TI': '', 'AB': '', 'FAU': '', 'AU': '', 'MH': '', 'PT': '',
                                      'AD': ''}
                author_list = list()
                affiliation_list = list()
                mesh_list = list()

        # there might be multiple authors per PMID and therefore we need to add them to a list
        if key == 'FAU':
            author_list.append(value)
        # each author might have one or more affiliations
        elif key == 'AD':
            affiliation_list.append(value)
        # there might be multiple mesh terms
        elif key == 'MH':
            # some of the mesh terms might have an * that needs to be removed
            mesh_list.append(value.replace('*', ''))

        # add the authors' information
        author_information[key] = value

        # changing first record flag
        first_record = False

    logging.getLogger('line.regular.time.line').info('Recommendation System script finished running successfully.')
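
# NOTE: output_author_information is only referenced in a comment above. An assumed sketch of
# what it could do (append the collected MEDLINE fields for one record to a CSV file); the
# output path and column order are illustrative, not taken from the original project.
import csv


def output_author_information(author_information, output_path='record_results/author_information.csv'):
    """Assumed sketch: append one parsed MEDLINE record to a CSV file."""
    fields = ['PMID', 'TI', 'AB', 'FAU', 'AU', 'MH', 'PT', 'AD']
    with open(output_path, 'a', newline='') as out_file:
        writer = csv.writer(out_file)
        row = [';'.join(value) if isinstance(value, list) else value
               for value in (author_information.get(field, '') for field in fields)]
        writer.writerow(row)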
def main():
    # get the path for the input file argument
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log", dest="logLevel", choices=['DEBUG', 'INFO', 'ERROR'], type=str.upper,
                        help="Set the logging level")
    args = parser.parse_args()

    logger_initialization(log_level=args.logLevel)
    logging.getLogger('line.regular.time.line').info('Running Recommendation System script')

    # import data from file
    logging.getLogger('regular').info('reading data from file')

    # Entrez (http://www.ncbi.nlm.nih.gov/Entrez) is a data retrieval system that provides users access to NCBI's
    # databases such as PubMed, GenBank, GEO, and many others
    # Use the mandatory email parameter so the NCBI can contact you if there is a problem
    Entrez.email = "*****@*****.**"  # Always tell NCBI who you are

    # logging.getLogger('regular').info('searching PubMed for the CHOP and UPENN authors')
    # handle = Entrez.esearch(db="pubmed", retmax=50000, idtype="esearch", mindate="2014/01/01",
    #                         maxdate="2017/05/01",
    #                         term="Perelman School of Medicine[Affiliation] OR Children's Hospital of "
    #                              "Philadelphia[Affiliation] OR University of Pennsylvania School of "
    #                              "Medicine[Affiliation] OR School of Medicine University of "
    #                              "Pennsylvania[Affiliation]",
    #                         usehistory="y")
    # search_results = Entrez.read(handle)
    # handle.close()
    # # obtaining the list of relevant PMIDs
    # id_list = search_results["IdList"]
    #
    # # get all the records based on the PMIDs
    # logging.getLogger('regular').info('getting relevant authors\' records based on PMIDs')
    # fetch_records_handle = Entrez.efetch(db="pubmed", id=id_list, retmode="text", rettype="medline")
    # # need to read all the data from the handle and store it in a file because if we just read line by line from
    # # the generator and the internet connection is not strong, then we run into http errors:
    # # http.client.IncompleteRead: IncompleteRead(0 bytes read)
    # logging.getLogger('regular').info('storing authors\' records on local file')
    # with open("results.xml", "w") as out_handle:
    #     out_handle.write(fetch_records_handle.read(validate=True))
    # # the results are now in the results.xml file and the original handle has had all of its data extracted
    # # (so we close it)
    # fetch_records_handle.close()

    logging.getLogger('regular').info('reading result files')
    records_handle = open("results.xml")
    fetch_records = parse(handle=records_handle)

    # initializing variables
    mesh_description_dict = obtain_descriptions()

    # contains all the metadata of the medical information: PubMed unique Identifier number (PMID), Primary Medical
    # Subject Heading (MeSH) and its description
    medical_record_file = open('record_results/medical_record.csv', 'w')
    medical_record_file.write('PMID,Primary MeSH,Description\n')
    # contains all the metadata elements on the author level: PubMed unique Identifier number (PMID), AuthorID (as a
    # combination of the author's last name, first name, and initials), institution (CHOP/PENN), Role: Chief Author
    # (CA), Ordinary Author (OA) or Principal Author (PA) and the author's affiliation
    author_record_file = open('record_results/author_record.csv', 'w')
    author_record_file.write('PMID,Author,Author_CHOP,Author_PENN,Role,Affiliation\n')
    # contains all the metadata elements on the paper level: PubMed unique Identifier number (PMID), Title, Abstract,
    # Year, Month, AuthorList, SubjectList, date
    paper_record_file = open('record_results/paper_record.csv', 'w')
    paper_record_file.write('PMID,Title,Abstract,Year,Month,Author List,Subject List,Date\n')

    # get the relevant information for each record
    for record_index, record in enumerate(fetch_records):
        logging.getLogger('regular').debug('record index = {0}'.format(record_index))

        pmid = record.get('PMID')
        title = record.get('TI')
        abstract = record.get('AB')
        authors = record.get('FAU')
        affiliations = record.get('AD')
        publication_type = record.get('PT')
        mesh_term = record.get('MH')
        date_created = record.get('EDAT')
        year, month = date_created.split('/')[:2]
        date = year + '/' + month

        logging.getLogger('regular').debug('pmid = {0}'.format(pmid))
        logging.getLogger('regular').debug('title = {0}'.format(title))
        logging.getLogger('regular').debug('abstract = {0}'.format(abstract))
        logging.getLogger('regular').debug('authors = {0}'.format(authors))
        logging.getLogger('regular').debug('affiliations = {0}'.format(affiliations))
        logging.getLogger('regular').debug('publication type = {0}'.format(publication_type))
        logging.getLogger('regular').debug('mesh term = {0}'.format(mesh_term))
        logging.getLogger('regular').debug('date created = {0}'.format(date_created))

        # assign the chief author, ordinary author or principal investigator role to each author
        roles = assign_roles(authors)
        # check and assign whether the authors belong to the CHOP or PENN organization
        chop_organization, penn_organization = assign_organization(affiliations)

        mesh_description = ''
        if mesh_term is None:
            mesh_term = ''
        else:
            term, mesh_description = convert_mesh_description(mesh_description_dict, mesh_term)
            mesh_term = ';'.join(mesh_term)

        # output information
        if mesh_description:
            msg = print_str(pmid, term, mesh_description)
            medical_record_file.write(msg)

        for author_index, organizations in enumerate(zip(chop_organization, penn_organization)):
            if 1 in organizations:
                msg = print_str(pmid, authors[author_index], organizations[0], organizations[1],
                                roles[author_index], affiliations[author_index])
                author_record_file.write(msg)

        authors = ';'.join(authors)
        msg = print_str(pmid, title, abstract, year, month, authors, mesh_term, date)
        paper_record_file.write(msg)

        # only process the first few records (debugging limit)
        if record_index == 10:
            break

    # closing all open files
    medical_record_file.close()
    author_record_file.close()
    paper_record_file.close()

    logging.getLogger('line.regular.time.line').info('Recommendation System script finished running successfully.')
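
# NOTE: print_str is used above to turn a variable number of fields into one line of the CSV
# record files but is not defined here. An assumed sketch: join the fields with commas, quote
# any field that itself contains a comma, and end with a newline so the result can be written
# directly to the open file handles.
def print_str(*fields):
    """Assumed sketch: join the given fields into a single comma-separated line."""
    cleaned = []
    for field in fields:
        text = '' if field is None else str(field)
        if ',' in text:
            # quote fields that would otherwise break the comma-separated layout
            text = '"' + text.replace('"', '""') + '"'
        cleaned.append(text)
    return ','.join(cleaned) + '\n'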
def main():
    # get the path for the input file argument
    parser = argparse.ArgumentParser()
    parser.add_argument("-l", "--log", dest="logLevel", choices=['DEBUG', 'INFO', 'ERROR'], type=str.upper,
                        help="Set the logging level")
    args = parser.parse_args()

    logger_initialization(log_level=args.logLevel)
    logging.getLogger('line.regular.time.line').info('Running Recommendation System script')

    # import data from file
    logging.getLogger('regular').info('reading data from file')

    # Entrez (http://www.ncbi.nlm.nih.gov/Entrez) is a data retrieval system that provides users access to NCBI's
    # databases such as PubMed, GenBank, GEO, and many others
    # Use the mandatory email parameter so the NCBI can contact you if there is a problem
    Entrez.email = "*****@*****.**"  # Always tell NCBI who you are

    # logging.getLogger('regular').info('searching PubMed for the CHOP and UPENN authors')
    # handle = Entrez.esearch(db="pubmed", retmax=50000, idtype="esearch", mindate="2014/01/01",
    #                         maxdate="2017/05/01",
    #                         term="Perelman School of Medicine[Affiliation] OR Children's Hospital of "
    #                              "Philadelphia[Affiliation] OR University of Pennsylvania School of "
    #                              "Medicine[Affiliation] OR School of Medicine University of "
    #                              "Pennsylvania[Affiliation]",
    #                         usehistory="y")
    # search_results = Entrez.read(handle)
    # handle.close()
    # # obtaining the list of relevant PMIDs
    # id_list = search_results["IdList"]
    #
    # # get all the records based on the PMIDs
    # logging.getLogger('regular').info('getting relevant authors\' records based on PMIDs')
    # fetch_records_handle = Entrez.efetch(db="pubmed", id=id_list, retmode="text", rettype="medline")
    # # need to read all the data from the handle and store it in a file because if we just read line by line from
    # # the generator and the internet connection is not strong, then we run into http errors:
    # # http.client.IncompleteRead: IncompleteRead(0 bytes read)
    # logging.getLogger('regular').info('storing authors\' records on local file')
    # with open("results.xml", "w") as out_handle:
    #     out_handle.write(fetch_records_handle.read(validate=True))
    # # the results are now in the results.xml file and the original handle has had all of its data extracted
    # # (so we close it)
    # fetch_records_handle.close()

    records_handle = open("results.xml")
    logging.getLogger('regular').info('creating parser record handle')
    # Use the Bio.Medline module to parse records
    fetch_records = Medline.parse(records_handle)

    # contains all the metadata elements on the author level: PubMed unique Identifier number (PMID), AuthorID (as a
    # combination of the author's last name, first name, and initials), institution: CHOP=1, PENN=0, Role: Chief
    # Author (CA), Ordinary Author (OA) or Principal Author (PA) and the author's affiliation
    author_record_df = pd.DataFrame(columns=['PMID', 'AuthorID', 'CHOP_PENN', 'ROLE', 'Affiliation'])
    # contains all the metadata elements on the paper level: PubMed unique Identifier number (PMID), Title, Abstract,
    # Year, Month, AuthorList, SubjectList, date
    paper_record_df = pd.DataFrame(columns=['PMID', 'Title', 'Abstract', 'Year', 'Month', 'AuthorList',
                                            'SubjectList', 'Date'])
    # contains all the metadata of the medical information: PubMed unique Identifier number (PMID), Primary Medical
    # Subject Heading (MeSH) and its description
    medical_record_df = pd.DataFrame(columns=['PMID', 'MESH', 'Description'])

    # get the description, related to the MeSH term, from the 2017MeshTree.csv file
    mesh_tree_file_object = open(r'C:\Users\GUERRAMARJ\PycharmProjects\Pubmed\template\2017MeshTree.csv')
    file_reader = csv.reader(mesh_tree_file_object, delimiter=',')
    mesh_description_dict = dict()
    logging.getLogger('regular').info('processing each record and obtaining relevant information')
    for line in file_reader:
        # line[0] = Number, line[1] = Description and line[2] = MESH
        mesh_description_dict[line[2]] = line[1]
    mesh_tree_file_object.close()

    # get the relevant information for each record
    for record_index, record in enumerate(fetch_records):
        logging.getLogger('regular').debug('record index = {0}'.format(record_index))

        # initialize all the variables
        pmid = ''
        title = ''
        abstract = ''
        affiliation = ''
        author_id = ''
        role = ''
        mesh_term = ''

        try:
            pmid = record.get('PMID')
            title = record.get('TI')
            abstract = record.get('AB')
            logging.getLogger('regular').debug('pmid = {0}'.format(pmid))
            logging.getLogger('regular').debug('title = {0}'.format(title))
            logging.getLogger('regular').debug('abstract = {0}'.format(abstract))

            # only used for debugging
            publication_type = record.get('PT')
            logging.getLogger('regular').debug('publication type = {0}'.format(publication_type))

            # Note: currently the record.get("AD") method returns a single string regardless of the number of
            # authors, i.e. if there are two authors it returns both affiliations in one string. As a result, this
            # script has to manually fetch the author information and their respective affiliations
            fetch_records_handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
            manual_record = Entrez.read(fetch_records_handle)

            try:
                if 'Book Chapter' in publication_type:
                    authors_list = manual_record['PubmedBookArticle'][0]['BookDocument']['AuthorList']
                else:
                    # author list for a PubMed article
                    authors_list = manual_record['PubmedArticle'][0]['MedlineCitation']['Article']['AuthorList']
            except Exception:
                logging.getLogger('regular').debug('error while obtaining the authors\' list')
                continue

            for author_index, author in enumerate(authors_list):
                try:
                    affiliation = author['AffiliationInfo'][0]['Affiliation']
                    author_id = author['LastName'] + ', ' + author['ForeName'] + ', ' + author['Initials']
                    logging.getLogger('regular').debug('affiliation = {0}'.format(affiliation))
                    logging.getLogger('regular').debug('author id = {0}'.format(author_id))

                    # Assign the author organization: 1 = CHOP, 0 = PENN
                    chop_penn = None
                    if 'children' in affiliation.lower():
                        chop_penn = 1
                    elif 'perelman' in affiliation.lower() or 'school of medicine' in affiliation.lower() or \
                            'pennsylvania' in affiliation.lower():
                        chop_penn = 0
                    logging.getLogger('regular').debug('chop_penn = {0}'.format(chop_penn))

                    # Assign the author's role
                    # the first two authors are considered "Chief Authors" (CA)
                    if author_index <= 1:
                        role = 'CA'
                    # an author after the first two who is not the last author is an "Ordinary Author" (OA)
                    elif author_index < len(authors_list) - 1:
                        role = 'OA'
                    # the last author is the "Principal Investigator" (PI)
                    elif author_index == len(authors_list) - 1:
                        role = 'PI'
                    else:
                        raise ValueError('Wrong author role specified')
                    logging.getLogger('regular').debug('role = {0}'.format(role))

                    if chop_penn is not None:
                        # insert the author information into the dataframe for later processing
                        author_record_df.loc[record_index] = [pmid, author_id, chop_penn, role, affiliation]

                except (IndexError, KeyError):
                    # sometimes there will be organizations on the authors list; in those cases, skip the entry
                    continue

            # Medical Subject Headings (MeSH); this can be a list
            mesh_term = record.get("MH")
            logging.getLogger('regular').debug('mesh term = {0}'.format(mesh_term))
            if mesh_term is not None:
                # fetch the description obtained from the 2017MeshTree file
                if len(mesh_term) > 1:
                    # because some mesh terms are not part of the 2017MeshTree, we have to loop through all of them
                    # until one works i.e. the first one found in the 2017MeshTree
                    description = ''
                    for mesh in mesh_term:
                        try:
                            term = mesh
                            print('term = {0}'.format(term))
                            # cleaning string
                            if '/' in term:
                                term = term.split('/')[0]
                            if '*' in term:
                                term = term.replace('*', '')
                            logging.getLogger('regular').debug('term = {0}'.format(term))
                            description = mesh_description_dict[term]
                        except KeyError:
                            logging.getLogger('regular').debug('not found term = {0}'.format(term))
                            continue
                    # insert the values in the dataframe
                    medical_record_df = medical_record_df.append(
                        pd.DataFrame([[pmid, mesh_term, description]], columns=['PMID', 'MESH', 'Description']),
                        ignore_index=True)

            # insert the paper information in the paper record dataframe
            # paper_record_df.append([pmid, title, abstract, year, month, authors_list, subject_list, date])

        except ValueError as error_message:
            msg = 'Problem while processing PMID={0}'.format(pmid)
            print(msg)
            print('error message = {0}'.format(error_message))

    logging.getLogger('line.regular.time.line').info('Recommendation System script finished running successfully.')
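
# NOTE: convert_mesh_description is used by the DataFrame- and CSV-based versions above but is
# not defined in this file. Based on the inline lookup in the function above, an assumed sketch
# returns the first MeSH term found in the 2017MeshTree dictionary together with its
# description; the (term, description) return order matches most call sites here.
def convert_mesh_description(mesh_description_dict, mesh_terms):
    """Assumed sketch: return (term, description) for the first MeSH term found in the tree."""
    for mesh in mesh_terms:
        # strip qualifiers ("Term/Qualifier") and the major-topic marker (*)
        term = mesh.split('/')[0].replace('*', '')
        if term in mesh_description_dict:
            return term, mesh_description_dict[term]
    return '', ''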
def main():
    parser = argparse.ArgumentParser(description='Grasp Assertiveness Script')
    parser.add_argument('-d', '--directory', help='dataset directory', required=True)
    parser.add_argument("-l", "--log", dest="logLevel", choices=['DEBUG', 'INFO', 'ERROR'],
                        help="Set the logging level")
    args = parser.parse_args()

    # check that the directory exists
    if not os.path.isdir(args.directory):
        msg = 'Directory = {0} not found.'.format(args.directory)
        raise IOError(msg)
    else:
        working_dir = args.directory

    logger_initialization(logger_dir=working_dir, parser=parser)
    logging.getLogger('time.info').info('Running the Grasp Assertiveness Script')

    try:
        dataset_dir = os.path.join(working_dir, 'dataset.csv')
        dataset = pd.read_csv(dataset_dir)
    except IOError:
        msg = 'Could not find \'dataset.csv\' in directory {0}'.format(working_dir)
        logging.getLogger('info').error(msg)
        raise IOError(msg)

    # index the dataset based on the trial index
    indexed_dataset = dataset.set_index(keys=['trial_index'])

    random_state = 7
    testing_size = 0.10

    logging.getLogger('info').info('Obtaining training and testing datasets')
    x_train, x_test, y_train, y_test, train_indx_list, test_indx_list, train_cl_indx, test_cl_indx = \
        train_test_split(indexed_dataset, test_size=testing_size, random_state=random_state)

    msg = 'training and testing parameters:\n\t\ttesting size = {0}, random state = {1}'.format(testing_size,
                                                                                                 random_state)
    logging.getLogger('tab.info').info(msg)

    # converting the indices to int (to remove the decimal) and then to string in order to print them all
    x_train_index = [str(int(indx)) for indx in set(x_train.index)]
    x_test_index = [str(int(indx)) for indx in set(x_test.index)]
    training_indices = ','.join(x_train_index)
    testing_indices = ','.join(x_test_index)
    msg = 'training indices:\n\t\t' + training_indices + '\n\ttesting indices:\n\t\t' + testing_indices
    logging.getLogger('tab.info.line').info(msg)

    training_dataset = x_train.values[train_cl_indx, :]
    training_labels = y_train.values[train_cl_indx]
    testing_dataset = x_test.values[test_cl_indx, :]
    testing_labels = y_test.values[test_cl_indx]

    logging.getLogger('info').info('Running SVM')
    # training and testing on the time-independent dataset
    clf = svm.SVC()
    clf.fit(training_dataset, training_labels)
    svm_score = clf.score(testing_dataset, testing_labels)
    msg = 'SVM score = {0}'.format(svm_score)
    logging.getLogger('tab.info').info(msg)

    logging.getLogger('info').info('Running HMM')

    n_pos_components = [2, 5, 7, 15]
    cov_types = ['diag', 'tied', 'spherical']
    n_iterations = [5, 10, 20, 50]

    for nc in n_pos_components:
        for cov in cov_types:
            for _iter in n_iterations:
                try:
                    msg = 'running HMM with the following parameters'
                    logging.getLogger('time.info').info(msg)
                    msg = 'number of states = {0}, type of covariance = {1}, number of iterations = {2}'.format(
                        nc, cov, _iter)
                    logging.getLogger('tab.info').info(msg)

                    # training and testing on the time-dependent dataset
                    hmm_model = hmm.GaussianHMM(n_components=nc, random_state=random_state, covariance_type=cov,
                                                n_iter=_iter)
                    hmm_model.fit(x_train, lengths=train_indx_list)

                    # training hmm and logistic regression
                    hmm_training_predictions = hmm_model.predict(x_train, lengths=train_indx_list)
                    hmm_training_predictions_reshaped, labels_processed = reshape_predictions(
                        predictions=hmm_training_predictions, labels=y_train)

                    msg = 'running Logistic Regression'
                    logging.getLogger('tab.time.info').info(msg)
                    # mapping hmm labels to true labels
                    logistic_regression_model = LogisticRegression()
                    logistic_regression_model.fit(X=hmm_training_predictions_reshaped, y=labels_processed)

                    # predictions on the testing dataset
                    hmm_testing_predictions = hmm_model.predict(x_test, lengths=test_indx_list)
                    hmm_testing_prediction_reshaped, testing_labels_processed = reshape_predictions(
                        predictions=hmm_testing_predictions, labels=y_test)
                    time_score = logistic_regression_model.score(X=hmm_testing_prediction_reshaped,
                                                                 y=testing_labels_processed)

                    msg = 'HMM-Logistic Regression score = {0}'.format(time_score)
                    logging.getLogger('tab.time.info').info(msg)

                except ValueError as error_message:
                    msg = 'Error while processing the following parameters ' \
                          '\n\t\tnumber of states = {0}, type of covariance = {1}, number of iterations = ' \
                          '{2}'.format(nc, cov, _iter)
                    logging.getLogger('tab.info').error(msg)
                    msg = 'error message = {0}'.format(error_message)
                    logging.getLogger('tab.tab.info').error(msg)

    msg = 'finished running HMM'
    logging.getLogger('time.info').info(msg)

    logging.getLogger('time.info').info('Finished running the Grasp Assertiveness Script')
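
# NOTE: reshape_predictions is called above but not shown. An assumed sketch that turns the 1-D
# sequence of HMM state predictions into the 2-D feature matrix expected by
# LogisticRegression.fit/score and flattens the labels to match.
import numpy as np


def reshape_predictions(predictions, labels):
    """Assumed sketch: reshape HMM state predictions for use as logistic-regression features."""
    reshaped = np.asarray(predictions).reshape(-1, 1)
    processed_labels = np.asarray(labels).ravel()
    return reshaped, processed_labels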
""" end program return : exit 0 """ msg = 'Program terminated.\n' logging.getLogger('').info(msg) exit(0) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument("-l", "--log", dest="logLevel", choices=['DEBUG', 'INFO', 'ERROR'], help="Set the logging level") logger_initialization(parser=parser) selected_option = -1 while selected_option != 9: # options print '' print 'Program Menu:\n' print '1: Gaussian-HMM program' print '2: GMM-HMM program' print '3: Check Matlab files' print '4: Convert Matlab files to hdf5 format file' print '5: Process matlab files with basic features' print '6: Move Matlab files from Dropbox to Working Directory' print '7: Perform Logistic Regression' print '8: Perform LSTM'