Esempio n. 1
0
def import_pedigree(f):
    """Load a pedigree file and return per-subject batch labels.

    The file is read with pandas as CSV or Excel depending on its
    extension. The sequencing duplicate(s) reported by
    ``find_the_duplicate`` are dropped from the result.

    Args:
        f: Path to the pedigree file (``*.csv``, ``*.xls`` or ``*.xlsx``).

    Returns:
        A tuple ``(peds, theDupe)``. ``peds`` is a Series indexed by
        ``Subject_ID`` holding the ``Investigator Column 3`` value with
        everything from the first underscore onward removed. ``theDupe`` is
        whatever ``find_the_duplicate`` returned (a list of Subject_IDs, or
        ``None`` when no duplicate was found).
    """
    test_file(f)

    # Some pedigrees arrive as CSV and some as Excel workbooks.  The dot is
    # escaped so that only a real file extension matches (an unescaped "."
    # would also accept names such as "pedigree_csv").
    if re.search(r'\.csv$', f):
        peds = pd.read_csv(f, dtype={'Family': object}, encoding='utf-8')
    elif re.search(r'\.xlsx?$', f):
        peds = pd.read_excel(f)
    else:
        err_out(
            "Error: Confused by pedigree file name {}, expecting *.csv or *.xlsx.\nExiting."
            .format(f), log)

    # Each set has a sequencing duplicate that should be removed.
    theDupe = find_the_duplicate(peds, "ped")

    # Build a Series of batch information with Subject_ID as the index,
    # skipping rows whose Subject_ID is empty or missing.
    peds = peds[peds['Subject_ID'] != ""][[
        'Subject_ID', 'Investigator Column 3'
    ]]
    peds = peds[peds.Subject_ID.notnull()].set_index('Subject_ID')
    # Keep only the part of the value before the first underscore.
    peds = peds['Investigator Column 3'].str.replace(r"_.*$", "", regex=True)

    if theDupe is not None:
        # errors='ignore': a duplicate ID may already be absent from the index.
        peds = peds.drop(theDupe, errors='ignore')

    return (peds, theDupe)
Esempio n. 2
0
def find_the_duplicate(df, fformat):
    """Identify the sequencing-duplicate Subject_ID(s) in a data frame.

    Args:
        df: DataFrame with a ``Subject_ID`` column (and, for the ``"ped"``
            format, an ``Investigator Column 1`` comments column).
        fformat: ``"ped"`` or ``"manifest"``; selects the detection rule.

    Returns:
        A list of the duplicate Subject_IDs, or ``None`` when none is found.
        More than one match is reported through ``send_update`` but still
        returned.

    Raises:
        ValueError: If ``fformat`` is neither ``"ped"`` nor ``"manifest"``.
    """
    if fformat == "ped":
        # Method 1: the comments column contains the word "duplicate".
        dupestr = df[df['Investigator Column 1'].str.contains(
            'Duplicate', case=False, na=False)]['Subject_ID']
        # Method 2: the duplicate is the only Subject_ID longer than 9 chars.
        dupelen = df.loc[df['Subject_ID'].str.len() > 9]['Subject_ID']
        # If the two methods disagree, error out.
        if not dupestr.equals(dupelen):
            # The whole message is parenthesised so that .format() fills the
            # placeholders; applied to the last literal only, it fills none.
            err_out(
                ("Warning: unable to ascertain duplicate sample identifier.\n"
                 "{} was identified in the pedigree file as a duplicate in 'Investigator Column 1',\n"
                 "{} has Subject_ID length > 9 characters.\n"
                 "Exiting").format(dupestr.tolist(), dupelen.tolist()), log)

    elif fformat == "manifest":
        # The duplicate is the only Subject_ID longer than 9 characters,
        # apart from the control sample.
        dupestr = df.loc[(df['Subject_ID'].str.len() > 9)
                         & (df['Subject_ID'] != "CONTROL_ID")]['Subject_ID']

    else:
        # Without this branch an unknown format would surface later as an
        # unbound-local error on `dupestr`.
        raise ValueError(
            "Unknown file format {!r}, expecting 'ped' or 'manifest'.".format(
                fformat))

    if dupestr.size > 1:
        send_update(
            "Warning: found more than one duplicate sample identifier: {}.".
            format(dupestr.tolist()), log)
    elif dupestr.size < 1:
        send_update(
            "Warning: no duplicate sample identifier was found in the {} file."
            .format(fformat), log)
        return None
    else:
        send_update("Duplicate sample identifier: {}".format(dupestr.tolist()),
                    log)

    return dupestr.tolist()
Esempio n. 3
0
def send_curl(curl_string):
    """Run a shell command (a curl invocation) and return its stdout.

    Args:
        curl_string: The complete command line, executed through the shell.

    Returns:
        The raw bytes the command wrote to stdout.

    A non-zero exit status is reported through ``err_out`` together with
    whatever the command wrote to stderr.
    """
    # NOTE(review): shell=True executes curl_string verbatim; callers must
    # make sure any embedded values are already safely quoted/encoded.
    proc = subprocess.Popen(curl_string,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            shell=True)
    (out, err) = proc.communicate()
    # stderr must be piped for `err` to be anything but None, and curl
    # signals failure through its exit status, so that is what is checked.
    if proc.returncode != 0:
        err_out(
            "Errored out attempting to establish session with BSI: .{}".format(
                "exit status {}: {}".format(
                    proc.returncode, err.decode("utf-8", errors="replace"))))
    return out
Esempio n. 4
0
def read_conf(cnf):
    """Read BSI credentials from a two-line authentication file.

    Line 1 of the file is the user name and line 2 the password; trailing
    whitespace on each is stripped and any further lines are ignored.

    Args:
        cnf: Path to the authentication file.

    Returns:
        A tuple ``(user, pw)`` with both values percent-encoded
        (``safe=''``, so every reserved character is escaped).
    """
    if not os.path.isfile(cnf):
        # SUE ADD Log to err_out
        err_out("Error: unable to locate BSI authentication file:  " + cnf)

    with open(cnf, 'r') as f:
        lines = f.readlines()

    # Report a truncated file explicitly instead of failing with IndexError.
    if len(lines) < 2:
        err_out(
            "Error: BSI authentication file must contain the user name on "
            "line 1 and the password on line 2:  " + cnf)

    user = lines[0].rstrip()
    pw = lines[1].rstrip()

    return (urllib.parse.quote(user, safe=''), urllib.parse.quote(pw, safe=''))
Esempio n. 5
0
def get_bsi_session(url, user, pw):
    """Log on to BSI with curl and return the session identifier.

    Args:
        url: Accepted for interface compatibility; not used, because the
            logon endpoint is hard-coded in the command below.
        user: User name, inserted verbatim into the form data.
        pw: Password, inserted verbatim into the form data.

    Returns:
        The raw bytes returned by ``send_curl`` (the session ID on success).

    ``user`` and ``pw`` end up inside a single-quoted shell argument, so
    they should already be percent-encoded (as ``read_conf`` returns them).
    """
    # The password operand had been masked out of this expression, which
    # left it syntactically invalid; it is concatenated like the user name.
    curl_string = (
        "curl -s -X POST"
        " --header 'Content-Type: application/x-www-form-urlencoded'"
        " --header 'Accept: text/plain'"
        " -d 'user_name=" + user + "&password=" + pw + "'"
        " 'https://rest.bsisystems.com/api/rest/EBMS/common/logon'")
    sessionID = send_curl(curl_string)

    failure_text = ("Logon failed: The username, password, and database "
                    "combination is incorrect")
    if failure_text in sessionID.decode("utf-8"):
        err_out("\n*** Error: login information is incorrect. ***\nQuitting.")

    return sessionID
Esempio n. 6
0
def _find_input_file(filenames, pattern):
    """Return the first name (in sorted order) matching pattern, or None."""
    matches = sorted(fnmatch.filter(filenames, pattern))
    return matches[0] if matches else None


def create_config(dir_info):
    """Locate the three input files in a directory and build the run config.

    Args:
        dir_info: Directory expected to contain the Subject Sample Mapping,
            Master Sample Key and Pedigree CSV files.

    Returns:
        A tuple ``(config, foundfilestext)``: ``config`` maps setting names
        to input paths, fixed locations and per-batch output file names;
        ``foundfilestext`` is a human-readable summary of the inputs found.

    A missing input file is reported through ``err_out``. The output file
    names use the module-level ``batch`` value, and errors are logged to the
    module-level ``log``; both are defined outside this function.
    """
    # List the directory once and match every pattern against that listing.
    filenames = os.listdir(dir_info)

    # (config key, glob pattern, label used in the error message)
    inputs = [
        ('mapping', "*SubjectSampleMappingFile.csv", 'Subject Sample Mapping'),
        ('samplekey', "*MasterSampleKey*.csv", 'Master Sample Key'),
        ('pedigree', "*Pedigree*.csv", 'Pedigree'),
    ]

    config = dict()
    for key, pattern, label in inputs:
        # Sorted matching makes the choice deterministic when several files
        # match; os.listdir() returns names in arbitrary order.
        found = _find_input_file(filenames, pattern)
        if found is None:
            # The message quotes the pattern that was actually searched for.
            err_out(
                'Error: unable to locate input {} file: "{}"\n'.format(
                    label, pattern), log)
        else:
            config[key] = os.path.join(dir_info, found)

    # Test for the three files; will error out if any are missing.
    foundfilestext = ""
    for key, _pattern, _label in inputs:
        test_file(config[key])
        foundfilestext += "\t" + key + ": " + config[key] + "\n"

    # len(config) is the number of input files at this point, because the
    # remaining settings are only added below.
    foundfilestext = "Successfully located {} input files:\n{}\n".format(
        str(len(config)), foundfilestext)

    # Fixed locations.
    config['rootdir'] = '/hpcdata/dir/CIDR_DATA_RENAMED'
    config['family_errors'] = 'family_errors.txt'

    # Output file names, one set per batch.
    config['tracking'] = 'sample_tracking_summary_batch' + str(batch) + '.txt'
    config['masterkey'] = 'masterkey_batch' + str(batch) + '.txt'
    config['newpedigree'] = 'seqr_ped_batch' + str(batch) + '.txt'
    config['batchinfo'] = 'genrptlinks_batch' + str(batch) + '.txt'
    config['linkscript_fname'] = 'link_bams_batch' + str(batch) + '.sh'

    return (config, foundfilestext)
Esempio n. 7
0
def test_file(f):
    """Check that an input file exists.

    Args:
        f: Path to check.

    Returns:
        ``True`` when ``f`` is an existing regular file. Otherwise the
        problem is reported through ``err_out`` and nothing is returned.
    """
    if os.path.isfile(f):
        return True
    err_out("Error: unable to locate input file:  " + f, log)
Esempio n. 8
0
def bsi_query(curl,
              url,
              session,
              fields,
              theIDs,
              search_field,
              isequal=True,
              islike=False):
    """Run a BSI report query through curl and return it as a DataFrame.

    Args:
        curl: Leading part of the curl command; the decoded session ID and
            a closing single quote are appended directly to it.
            (Presumably it ends inside a quoted session header - confirm
            against the caller.)
        url: Report endpoint; the query string is appended to it.
        session: Session ID as bytes (decoded as UTF-8).
        fields: Display field names, translated with ``get_bsi_name``.
        theIDs: Values to search for; spaces are replaced by ``%20`` and the
            values are joined with ``%3B``.
        search_field: Field to search on, translated with ``get_bsi_name``.
        isequal: When false, a ``!`` is placed before the operator.
        islike: When true, the operator is ``%3D%40`` instead of ``%3D``.

    Returns:
        A DataFrame built from the response's ``rows`` and ``headers``, with
        the known BSI column titles renamed to underscore-style names.
    """
    bsi_fields = [get_bsi_name(name) for name in fields]

    study_criterion = "&criteria=subject.study_id%3DNIAID%20Centralized%20Sequencing"
    # An order-status criterion
    # (&criteria=sample.field_314%3D%22Specimen%20Collected%22, added as per
    # Xi Cheng email 2/20/2019) is currently not part of the query.

    # Replace spaces in the IDs with "%20".
    encoded_ids = [re.sub(" ", "%20", one_id) for one_id in theIDs]

    # Assemble the command: prefix + session + quoted URL with query string.
    command = curl + session.decode(encoding='UTF-8') + "'"
    display_part = "?display_fields=" + "&display_fields=".join(bsi_fields)
    command = command + " '" + url + display_part + study_criterion
    command = command + "&criteria=" + get_bsi_name(search_field)

    # "!" negates the comparison; "=@" (encoded as %3D%40) means "like".
    negation = "" if isequal else "!"
    operator = "%3D%40" if islike else "%3D"
    command = command + negation + operator + "%3B".join(encoded_ids)
    command = command + "&type=1'"

    # Fetch and parse the JSON response.
    raw = send_curl(command)
    payload = json.loads(raw.decode('utf-8'))

    if 'message' in payload and re.search("Error running report:",
                                          payload['message']):
        err_out(
            "\n*** BSI query failed to return valid results ***\nQuitting."
        )

    if len(payload) == 0:
        err_out("BSI query failed to return valid results")

    # Convert the data into a dataframe.
    frame = pd.DataFrame(payload['rows'], columns=payload['headers'])

    # BSI column title -> column name used by the rest of the pipeline.
    column_names = {
        'CRIS Order #': 'CRIS_Order#',
        'Batch Sent': 'Batch_Sent',
        'Batch Received': 'Batch_Received',
        'Batch Ready': 'Batch_Ready',
        'Phenotips Family ID': 'Phenotips_Family_ID',
        'PhenotipsId': 'Phenotips_ID',
        'Mother PhenotipsId': 'Mother_Phenotips_ID',
        'Father PhenotipsId': 'Father_Phenotips_ID',
        'Family Complete Status': 'Family_Complete_Status',
        'Affected Status': 'Affected',
        'CIDR Exome ID': 'CIDR_Exome_ID',
        'CRIS Order Status': 'CRIS_Order_Status',
        'Patient Name': 'Patient_Name',
        'Date Drawn': 'Date_Drawn',
        'Date Received': 'Date_Received',
        'GRIS Owner': 'GRIS_Owner',
        'Tissue Origin': 'Tissue',
        'Subject ID': 'MRN',
        'Order Date': 'Order_Date',
    }
    return frame.rename(columns=column_names)