def import_pedigree(f):
    """Import a pedigree file (CSV or Excel) and reduce it to a batch-info Series.

    Parameters
    ----------
    f : str
        Path to the pedigree file; must end in .csv, .xls, or .xlsx.

    Returns
    -------
    (pandas.Series, list or None)
        Series of 'Investigator Column 3' values (text after the first "_"
        stripped) indexed by Subject_ID, plus the duplicate-sample
        identifier(s) reported by find_the_duplicate (None when none found).
    """
    test_file(f)
    # import the data -- but some are csv and some are excel!
    # BUG FIX: the original patterns '.csv$' and '.xlsx*$' left the dot
    # unescaped (it matched any character); anchor on a literal dot so only
    # real .csv / .xls / .xlsx extensions are accepted.
    if re.search(r'\.csv$', f):
        peds = pd.read_csv(f, dtype={'Family': object}, encoding='utf-8')
    elif re.search(r'\.xlsx?$', f):
        peds = pd.read_excel(f)
    else:
        err_out(
            "Error: Confused by pedigree file name {}, expecting *.csv or *.xlsx.\nExiting."
            .format(f), log)
    ## Each set has a sequencing duplicate that should be removed
    theDupe = find_the_duplicate(peds, "ped")
    # create a series of batch information with Subject as the index
    peds = peds[peds['Subject_ID'] != ""][[
        'Subject_ID', 'Investigator Column 3'
    ]]
    peds = peds[peds.Subject_ID.notnull()].set_index('Subject_ID')
    # keep only the text before the first "_" in the batch column
    peds = peds['Investigator Column 3'].str.replace("_.*$", "", regex=True)
    # BUG FIX: identity comparison "is not None" instead of "!= None"
    # (theDupe may be a list, where "!= None" relies on incidental behavior).
    if theDupe is not None:
        peds = peds.drop(theDupe, errors='ignore')
    return (peds, theDupe)
def find_the_duplicate(df, fformat):
    """Locate the sequencing-duplicate sample in a pedigree or manifest table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Subject_ID'; pedigree tables also need
        'Investigator Column 1'.
    fformat : str
        "ped" or "manifest" -- selects the detection strategy.

    Returns
    -------
    list or None
        Subject_ID value(s) flagged as the duplicate, or None when none found.
    """
    if fformat == "ped":
        # find the duplicate using the comments column with the word "duplicate"
        dupestr = df[df['Investigator Column 1'].str.contains(
            'Duplicate', case=False, na=False)]['Subject_ID']
        # find the duplicate because it is the only subject ID > 9 characters
        dupelen = df.loc[df['Subject_ID'].str.len() > 9]['Subject_ID']
        # If the two methods aren't the same answer, error out
        if not dupestr.equals(dupelen):
            # BUG FIX: .format() previously bound only to the final "Exiting"
            # literal of a "+"-chain, so the {} placeholders were never
            # filled; use one parenthesized string so both IDs appear.
            err_out(
                ("Warning: unable to ascertain duplicate sample identifier.\n"
                 "{} was identified in the pedigree file as a duplicate in 'Investigator Column 1',\n"
                 "{} has Subject_ID length > 9 characters.\n"
                 "Exiting").format(dupestr, dupelen), log)
    elif fformat == "manifest":
        # find the duplicate because it is the only subject ID > 9 characters
        dupestr = df.loc[(df['Subject_ID'].str.len() > 9)
                         & (df['Subject_ID'] != "CONTROL_ID")]['Subject_ID']
    # NOTE(review): an unrecognized fformat leaves dupestr undefined and
    # raises NameError below -- callers only pass "ped" or "manifest".
    if dupestr.size > 1:
        send_update(
            "Warning: found more than one duplicate sample identifier: {}.".
            format(dupestr.tolist()), log)
    elif dupestr.size < 1:
        send_update(
            "Warning: no duplicate sample identifier was found in the {} file."
            .format(fformat), log)
        return (None)
    else:
        send_update("Duplicate sample identifier: {}".format(dupestr.tolist()),
                    log)
    return (dupestr.tolist())
def send_curl(curl_string):
    """Run a pre-built curl command line in a shell and return its stdout (bytes).

    Parameters
    ----------
    curl_string : str
        Complete curl command, built by the caller.

    Returns
    -------
    bytes
        Raw stdout of the curl process.
    """
    # NOTE(review): shell=True with an interpolated command string is
    # shell-injection-prone if any part of curl_string is untrusted --
    # confirm all inputs are internal-only.
    proc = subprocess.Popen([curl_string], stdout=subprocess.PIPE, shell=True)
    (out, err) = proc.communicate()
    # NOTE(review): stderr is not piped, so communicate() always returns
    # err=None here and this error branch can never fire.
    if err is not None:
        err_out(
            "Errored out attempting to establish session with BSI: .{}".format(
                err))
    return (out)
def read_conf(cnf):
    """Read BSI credentials from *cnf* (username on line 1, password on line 2).

    Returns
    -------
    (str, str)
        URL-encoded username and password (every character percent-escaped,
        since safe='').
    """
    if not os.path.isfile(cnf):
        # SUE ADD Log to err_out
        err_out("Error: unable to locate BSI authentication file: " + cnf)
    with open(cnf, 'r') as fh:
        user, pw = [line.rstrip() for line in fh.readlines()[:2]]
    return (urllib.parse.quote(user, safe=''), urllib.parse.quote(pw, safe=''))
def get_bsi_session(url, user, pw):
    """Log on to BSI and return the session ID (bytes) from the REST logon call.

    Parameters
    ----------
    url : str
        Unused here -- the logon endpoint is hard-coded below; kept for
        signature compatibility with callers.
    user, pw : str
        URL-encoded username and password (see read_conf).

    Exits via err_out when BSI reports a failed logon.
    """
    # BUG FIX: the password was dropped from the POST body (the original
    # line had a broken concatenation around the password field); splice
    # user and password into the form data explicitly.
    curl_string = (
        "curl -s -X POST --header 'Content-Type: application/x-www-form-urlencoded'"
        " --header 'Accept: text/plain' -d 'user_name=" + user +
        "&password=" + pw +
        "' 'https://rest.bsisystems.com/api/rest/EBMS/common/logon'")
    sessionID = send_curl(curl_string)
    # On a failed logon BSI returns a plain-text error message instead of a
    # session ID; detect it in the decoded response.
    if sessionID.decode("utf-8").find(
            "Logon failed: The username, password, and database combination is incorrect"
    ) != -1:
        err_out("\n*** Error: login information is incorrect. ***\nQuitting.")
    return (sessionID)
def create_config(dir_info):
    """Build the pipeline configuration dict from the input directory.

    Locates the three required CIDR input files (subject-sample mapping,
    master sample key, pedigree), then fills in fixed paths and
    batch-numbered output file names.

    Parameters
    ----------
    dir_info : str
        Directory expected to contain the three input CSV files.

    Returns
    -------
    (dict, str)
        The config dict and a human-readable summary of the located files.
    """
    config = dict()
    # (config key, glob pattern, description used in the error message)
    required = [
        ('mapping', "*SubjectSampleMappingFile.csv",
         'Subject Sample Mapping file: "*SubjectSampleMappingFile.csv"'),
        ('samplekey', "*MasterSampleKey*.csv",
         'Master Sample Key file: "*MasterSampleKey.csv"'),
        ('pedigree', "*Pedigree*.csv", 'Pedigree file: "*Pedigree*.csv"'),
    ]
    # List the directory once instead of once per pattern.
    entries = os.listdir(dir_info)
    for key, pattern, desc in required:
        f = fnmatch.filter(entries, pattern)
        if len(f) < 1:
            err_out('Error: unable to locate input {}\n'.format(desc), log)
        else:
            config[key] = os.path.join(dir_info, f[0])
    # test for the three files, will error out if any are missing
    foundfilestext = ""
    for i in ['mapping', 'samplekey', 'pedigree']:
        test_file(config[i])
        foundfilestext += "\t" + i + ": " + config[i] + "\n"
    foundfilestext = "Successfully located {} input files:\n{}\n".format(
        str(len(config)), foundfilestext)
    # add to the config variables
    config['rootdir'] = '/hpcdata/dir/CIDR_DATA_RENAMED'
    config['family_errors'] = 'family_errors.txt'
    # output file names ("batch" is a module-level variable)
    config['tracking'] = 'sample_tracking_summary_batch' + str(batch) + '.txt'
    config['masterkey'] = 'masterkey_batch' + str(batch) + '.txt'
    config['newpedigree'] = 'seqr_ped_batch' + str(batch) + '.txt'
    config['batchinfo'] = 'genrptlinks_batch' + str(batch) + '.txt'
    config['linkscript_fname'] = 'link_bams_batch' + str(batch) + '.sh'
    return (config, foundfilestext)
def test_file(f): if not os.path.isfile(f): err_out("Error: unable to locate input file: " + f, log) else: return (True)
def bsi_query(curl,
              url,
              session,
              fields,
              theIDs,
              search_field,
              isequal=True,
              islike=False):
    """Run a BSI report query via curl and return the results as a DataFrame.

    Parameters
    ----------
    curl : str
        Prefix of the curl command; the decoded session ID, the report URL,
        and all criteria are appended to it below.
    url : str
        BSI REST report endpoint.
    session : bytes
        Session ID returned by the BSI logon call (decoded and spliced in).
    fields : list of str
        Friendly field names, mapped through get_bsi_name to BSI
        display-field identifiers.
    theIDs : list of str
        Values matched against search_field ("%3B"-joined, i.e. ';').
    search_field : str
        Friendly name of the criteria field.
    isequal : bool
        When False, a "!" is prepended to negate the criteria.
    islike : bool
        When True, the "=@" (like) operator is used instead of "=".

    Returns
    -------
    pandas.DataFrame
        Query rows with BSI column headers renamed to pipeline names.
    """
    fields = [get_bsi_name(f) for f in fields]
    # Hard-wired criterion restricting results to the sequencing study.
    study = "&criteria=subject.study_id%3DNIAID%20Centralized%20Sequencing"
    ## order status added as per Xi Cheng email 2/20/2019
    # NOTE(review): order_status and limit are currently unused -- the line
    # applying order_status is commented out further down.
    order_status = "&criteria=sample.field_314%3D%22Specimen%20Collected%22"
    limit = '10'
    # replace spaces in the IDs with "%20"
    theIDs = [re.sub(" ", "%20", x) for x in theIDs]
    # Construct the curl command
    curl += session.decode(encoding='UTF-8') + "'"
    ## order status added as per Xi Cheng email 2/20/2019
    curl += " '" + url + "?display_fields=" + "&display_fields=".join(
        fields) + study
    #curl += " '" + url + "?display_fields=" + "&display_fields=".join(fields) + study + order_status
    curl += "&criteria=" + get_bsi_name(search_field)
    #print(curl)
    # add the "!" for not or "=@" for like ("%3D" is '=', "%40" is '@')
    if not isequal:
        curl += "!"
    if not islike:
        curl += "%3D" + "%3B".join(theIDs)
    else:
        curl += "%3D%40" + "%3B".join(theIDs)
    curl = curl + "&type=1'"
    # print(curl)
    # Get the data using the curl command
    data = send_curl(curl)
    # print("DATA:\n{}".format(data))
    data = data.decode('utf-8')
    data = json.loads(data)
    #print(data)
    # BSI signals report failures in a top-level 'message' key.
    if 'message' in data:
        if re.search("Error running report:", data['message']):
            err_out(
                "\n*** BSI query failed to return valid results ***\nQuitting."
            )
    if len(data) == 0:
        err_out("BSI query failed to return valid results")
    # Convert the data into a dataframe
    df = pd.DataFrame(data['rows'], columns=data['headers'])
    # print("Curl results size: {}".format(df.shape))
    # Rename the columns: map BSI display names to the pipeline's names.
    colnameDict = {
        'CRIS Order #': 'CRIS_Order#',
        'Batch Sent': 'Batch_Sent',
        'Batch Received': 'Batch_Received',
        'Batch Ready': 'Batch_Ready',
        'Phenotips Family ID': 'Phenotips_Family_ID',
        'PhenotipsId': 'Phenotips_ID',
        'Mother PhenotipsId': 'Mother_Phenotips_ID',
        'Father PhenotipsId': 'Father_Phenotips_ID',
        'Family Complete Status': 'Family_Complete_Status',
        'Affected Status': 'Affected',
        'CIDR Exome ID': 'CIDR_Exome_ID',
        'CRIS Order Status': 'CRIS_Order_Status',
        'Patient Name': 'Patient_Name',
        'Date Drawn': 'Date_Drawn',
        'Date Received': 'Date_Received',
        'GRIS Owner': 'GRIS_Owner',
        'Tissue Origin': 'Tissue',
        'Subject ID': 'MRN',
        'Order Date': 'Order_Date'
    }
    df.rename(columns=colnameDict, inplace=True)
    return (df)