def parse_genofile(config, config_dic):
    genofile = open(config_dic['genofile'], 'r')
    meta_dic = {}
    for line in genofile:
        line = line.strip()
        if len(line) == 0:
            continue
        if line.startswith('#'):
            continue
        if line.startswith('@'):
            line = line.strip('@')
            items = line.split(';')
            for item in items:
                kv = re.split(':|=', item)
                meta_dic[kv[0].strip()] = kv[1].strip()
            continue
        if line.lower().startswith("chr"):
            #
            print("geno file meta dictionary:")
            for k, v in meta_dic.items():
                print("\t%s: %s" % (k, v))
            #
            print("geno file head:\n\t%s" % line)
            strainnames = line.split()[4:]
            config_dic['strains'] = datastructure.get_strains_bynames(inbredsetid=config_dic['inbredsetid'], strainnames=strainnames, updatestrainxref="yes")
            continue
        # geno file line, marker
        marker_dic = parse_marker(line)
        marker_dic['genoid'] = check_or_insert_geno(config_dic, marker_dic)
        if check_genoxref(config_dic, marker_dic):
            continue
        insert_genodata(config, config_dic, marker_dic)
        insert_genoxref(config_dic, marker_dic)
        config_dic['dataid'] += 1
    genofile.close()
Example #2
0
def parse_genofile(config, config_dic):
    genofile = open(config_dic['genofile'], 'r')
    meta_dic = {}
    for line in genofile:
        line = line.strip()
        if len(line) == 0:
            continue
        if line.startswith('#'):
            continue
        if line.startswith('@'):
            line = line.strip('@')
            items = line.split(';')
            for item in items:
                kv = re.split(':|=', item)
                meta_dic[kv[0].strip()] = kv[1].strip()
            continue
        if line.lower().startswith("chr"):
            #
            print("geno file meta dictionary:")
            for k, v in meta_dic.items():
                print("\t%s: %s" % (k, v))
            #
            print("geno file head:\n\t%s" % line)
            strainnames = line.split()[4:]
            config_dic['strains'] = datastructure.get_strains_bynames(inbredsetid=config_dic['inbredsetid'], strainnames=strainnames, updatestrainxref="yes")
            continue
        # geno file line, marker
        marker_dic = parse_marker(line)
        marker_dic['genoid'] = check_or_insert_geno(config_dic, marker_dic)
        if check_genoxref(config_dic, marker_dic):
            continue
        insert_genodata(config, config_dic, marker_dic)
        insert_genoxref(config_dic, marker_dic)
        config_dic['dataid'] += 1
    genofile.close()
Example #3
0
def main(argv):
    # config
    config = utilities.get_config(argv[1])
    print "config:"
    for item in config.items('config'):
        print "\t%s" % (str(item))
    # var
    inbredsetid = config.get('config', 'inbredsetid')
    print "inbredsetid: %s" % inbredsetid
    species = datastructure.get_species(inbredsetid)
    speciesid = species[0]
    print "speciesid: %s" % speciesid
    # datafile
    datafile = open(config.get('config', 'datafile'), 'r')
    data = csv.reader(datafile, delimiter ="\t", quotechar='"')
    strainnames = data.next()
    print "strainnames: %s\n\t%s" % (len(strainnames), strainnames)
    strains = datastructure.get_strains_bynames(inbredsetid=inbredsetid, strainnames=strainnames, updatestrainxref="yes")
    print("strain: %s\n\t%s" % (len(strains), strains))
    datafile.close()
def parse_genofile(params):
    # genofile
    with open(params['genofile']) as genofile:
        meta_data = {}
        print()
        # parse genofile
        for line in genofile:
            line = line.strip()
            if not line:
                pass
            elif line.startswith('#'):
                pass
            elif line.startswith('@'):
                line = line.strip('@')
                for item in line.split(';'):
                    kv = re.split(':|=', item)
                    meta_data[kv[0].strip()] = kv[1].strip()
            
            elif line.lower().startswith("chr"):
                print("geno file meta:")
                for key, value in meta_data.iteritems():
                    print("\t{}: {}".format(key, value))
                print("geno file head:\n\t{}\n".format(line))
                strain_names = line.split()[4:]
                strains = datastructure.get_strains_bynames(inbredsetid=inbredsetid,
                                                            strain_names=strain_names,
                                                            updatestrainxref="yes")
               
            else:
                # geno file line
                marker = Marker(line)
                #
                geno_id = check_or_insert_geno(params, marker)
    
                if check_genoxref(params): #Check if this can go earlier
                    continue
                
                insert_genodata(params)
                insert_genoxref(params)
                data_id += 1
def main(argv):
    # config
    config = utilities.get_config(argv[1])
    print("config:")
    for item in config.items('config'):
        print(("\t%s" % (str(item))))
    # var
    inbredsetid = config.get('config', 'inbredsetid')
    print(("inbredsetid: %s" % inbredsetid))
    species = datastructure.get_species(inbredsetid)
    speciesid = species[0]
    print(("speciesid: %s" % speciesid))
    dataid = datastructure.get_nextdataid_phenotype()
    print(("next data id: %s" % dataid))
    cursor, con = utilities.get_cursor()
    # datafile
    datafile = open(config.get('config', 'datafile'), 'r')
    phenotypedata = csv.reader(datafile, delimiter='\t', quotechar='"')
    phenotypedata_head = next(phenotypedata)
    print(("phenotypedata head:\n\t%s" % phenotypedata_head))
    strainnames = phenotypedata_head[1:]
    strains = datastructure.get_strains_bynames(inbredsetid=inbredsetid, strainnames=strainnames, updatestrainxref="yes")
    # metafile
    metafile = open(config.get('config', 'metafile'), 'r')
    phenotypemeta = csv.reader(metafile, delimiter='\t', quotechar='"')
    phenotypemeta_head = next(phenotypemeta)
    print(("phenotypemeta head:\n\t%s" % phenotypemeta_head))
    print()
    # load
    for metarow in phenotypemeta:
        #
        datarow_value = next(phenotypedata)
        datarow_se = next(phenotypedata)
        datarow_n = next(phenotypedata)
        # Phenotype
        sql = """
            INSERT INTO Phenotype
            SET
            Phenotype.`Pre_publication_description`=%s,
            Phenotype.`Post_publication_description`=%s,
            Phenotype.`Original_description`=%s,
            Phenotype.`Pre_publication_abbreviation`=%s,
            Phenotype.`Post_publication_abbreviation`=%s,
            Phenotype.`Lab_code`=%s,
            Phenotype.`Submitter`=%s,
            Phenotype.`Owner`=%s,
            Phenotype.`Authorized_Users`=%s,
            Phenotype.`Units`=%s
            """
        cursor.execute(sql, (
            utilities.to_db_string(metarow[1], None),
            utilities.to_db_string(metarow[2], None),
            utilities.to_db_string(metarow[3], None),
            utilities.to_db_string(metarow[4], None),
            utilities.to_db_string(metarow[5], None),
            utilities.to_db_string(metarow[6], None),
            utilities.to_db_string(metarow[7], None),
            utilities.to_db_string(metarow[8], None),
            utilities.to_db_string(metarow[9], ""),
            utilities.to_db_string(metarow[18], ""),
            ))
        rowcount = cursor.rowcount
        phenotypeid = con.insert_id()
        print(("INSERT INTO Phenotype: %d record: %d" % (rowcount, phenotypeid)))
        # Publication
        publicationid = None # reset
        pubmed_id = utilities.to_db_string(metarow[0], None)
        if pubmed_id:
            sql = """
                SELECT Publication.`Id`
                FROM Publication
                WHERE Publication.`PubMed_ID`=%s
                """
            cursor.execute(sql, (pubmed_id))
            re = cursor.fetchone()
            if re:
                publicationid = re[0]
                print(("get Publication record: %d" % publicationid))
        if not publicationid:
            sql = """
                INSERT INTO Publication
                SET
                Publication.`PubMed_ID`=%s,
                Publication.`Abstract`=%s,
                Publication.`Authors`=%s,
                Publication.`Title`=%s,
                Publication.`Journal`=%s,
                Publication.`Volume`=%s,
                Publication.`Pages`=%s,
                Publication.`Month`=%s,
                Publication.`Year`=%s
                """
            cursor.execute(sql, (
                utilities.to_db_string(metarow[0], None),
                utilities.to_db_string(metarow[12], None),
                utilities.to_db_string(metarow[10], ""),
                utilities.to_db_string(metarow[11], None),
                utilities.to_db_string(metarow[13], None),
                utilities.to_db_string(metarow[14], None),
                utilities.to_db_string(metarow[15], None),
                utilities.to_db_string(metarow[16], None),
                utilities.to_db_string(metarow[17], ""),
                ))
            rowcount = cursor.rowcount
            publicationid = con.insert_id()
            print(("INSERT INTO Publication: %d record: %d" % (rowcount, publicationid)))
        # data
        for index, strain in enumerate(strains):
            #
            strainid = strain[0]
            value   = utilities.to_db_float(datarow_value[index+1], None)
            se      = utilities.to_db_float(datarow_se[index+1], None)
            n       = utilities.to_db_int(datarow_n[index+1], None)
            #
            if value is not None:
                sql = """
                    INSERT INTO PublishData
                    SET
                    PublishData.`Id`=%s,
                    PublishData.`StrainId`=%s,
                    PublishData.`value`=%s
                    """
                cursor.execute(sql, (dataid, strainid, value))
            if se is not None:
                sql = """
                    INSERT INTO PublishSE
                    SET
                    PublishSE.`DataId`=%s,
                    PublishSE.`StrainId`=%s,
                    PublishSE.`error`=%s
                    """
                cursor.execute(sql, (dataid, strainid, se))
            if n is not None:
                sql = """
                    INSERT INTO NStrain
                    SET
                    NStrain.`DataId`=%s,
                    NStrain.`StrainId`=%s,
                    NStrain.`count`=%s
                    """
                cursor.execute(sql, (dataid, strainid, n))
        # PublishXRef
        sql = """
            INSERT INTO PublishXRef
            SET
            PublishXRef.`InbredSetId`=%s,
            PublishXRef.`PhenotypeId`=%s,
            PublishXRef.`PublicationId`=%s,
            PublishXRef.`DataId`=%s,
            PublishXRef.`comments`=%s
            """
        cursor.execute(sql, (inbredsetid, phenotypeid, publicationid, dataid, ""))
        rowcount = cursor.rowcount
        publishxrefid = con.insert_id()
        print(("INSERT INTO PublishXRef: %d record: %d" % (rowcount, publishxrefid)))
        # for loop next
        dataid += 1
        print()
    # release
    con.close()
def main(argv):
    # config
    config = utilities.get_config(argv[1])
    print "config:"
    for item in config.items('config'):
        print "\t%s" % (str(item))
    # var
    inbredsetid = config.get('config', 'inbredsetid')
    print "inbredsetid: %s" % inbredsetid
    species = datastructure.get_species(inbredsetid)
    speciesid = species[0]
    print "speciesid: %s" % speciesid
    dataid = datastructure.get_nextdataid_phenotype()
    print "next data id: %s" % dataid
    cursor, con = utilities.get_cursor()
    # datafile
    datafile = open(config.get('config', 'datafile'), 'r')
    phenotypedata = csv.reader(datafile, delimiter='\t', quotechar='"')
    phenotypedata_head = phenotypedata.next()
    print("phenotypedata head [%d]:\n\t%s" % (len(phenotypedata_head), phenotypedata_head))
    strainnames = phenotypedata_head[1:]
    strains = datastructure.get_strains_bynames(inbredsetid=inbredsetid, strainnames=strainnames, updatestrainxref="yes")
    # metafile
    metafile = open(config.get('config', 'metafile'), 'r')
    phenotypemeta = csv.reader(metafile, delimiter='\t', quotechar='"')
    phenotypemeta_head = phenotypemeta.next()
    print("phenotypemeta head [%d]:\n\t%s" % (len(phenotypemeta_head), phenotypemeta_head))
    print
    # load
    for metarow in phenotypemeta:
        #
        datarow_value = phenotypedata.next()
        datarow_se = phenotypedata.next()
        datarow_n = phenotypedata.next()
        # Phenotype
        sql = """
            INSERT INTO Phenotype
            SET
            Phenotype.`Pre_publication_description`=%s,
            Phenotype.`Post_publication_description`=%s,
            Phenotype.`Original_description`=%s,
            Phenotype.`Pre_publication_abbreviation`=%s,
            Phenotype.`Post_publication_abbreviation`=%s,
            Phenotype.`Lab_code`=%s,
            Phenotype.`Submitter`=%s,
            Phenotype.`Owner`=%s,
            Phenotype.`Authorized_Users`=%s,
            Phenotype.`Units`=%s
            """
        cursor.execute(sql, (
            utilities.to_db_string(metarow[1], None),
            utilities.to_db_string(metarow[2], None),
            utilities.to_db_string(metarow[3], None),
            utilities.to_db_string(metarow[4], None),
            utilities.to_db_string(metarow[5], None),
            utilities.to_db_string(metarow[6], None),
            utilities.to_db_string(metarow[7], None),
            utilities.to_db_string(metarow[8], None),
            utilities.to_db_string(metarow[9], ""),
            utilities.to_db_string(metarow[18], ""),
            ))
        rowcount = cursor.rowcount
        phenotypeid = con.insert_id()
        print "INSERT INTO Phenotype: %d record: %d" % (rowcount, phenotypeid)
        # Publication
        publicationid = None # reset
        pubmed_id = utilities.to_db_string(metarow[0], None)
        if pubmed_id:
            sql = """
                SELECT Publication.`Id`
                FROM Publication
                WHERE Publication.`PubMed_ID`=%s
                """
            cursor.execute(sql, (pubmed_id))
            re = cursor.fetchone()
            if re:
                publicationid = re[0]
                print "get Publication record: %d" % publicationid
        if not publicationid:
            sql = """
                INSERT INTO Publication
                SET
                Publication.`PubMed_ID`=%s,
                Publication.`Abstract`=%s,
                Publication.`Authors`=%s,
                Publication.`Title`=%s,
                Publication.`Journal`=%s,
                Publication.`Volume`=%s,
                Publication.`Pages`=%s,
                Publication.`Month`=%s,
                Publication.`Year`=%s
                """
            cursor.execute(sql, (
                utilities.to_db_string(metarow[0], None),
                utilities.to_db_string(metarow[12], None),
                utilities.to_db_string(metarow[10], ""),
                utilities.to_db_string(metarow[11], None),
                utilities.to_db_string(metarow[13], None),
                utilities.to_db_string(metarow[14], None),
                utilities.to_db_string(metarow[15], None),
                utilities.to_db_string(metarow[16], None),
                utilities.to_db_string(metarow[17], ""),
                ))
            rowcount = cursor.rowcount
            publicationid = con.insert_id()
            print "INSERT INTO Publication: %d record: %d" % (rowcount, publicationid)
        # data
        for index, strain in enumerate(strains):
            #
            strainid = strain[0]
            value   = utilities.to_db_float(datarow_value[index+1], None)
            se      = utilities.to_db_float(datarow_se[index+1], None)
            n       = utilities.to_db_int(datarow_n[index+1], None)
            #
            if value is not None:
                sql = """
                    INSERT INTO PublishData
                    SET
                    PublishData.`Id`=%s,
                    PublishData.`StrainId`=%s,
                    PublishData.`value`=%s
                    """
                cursor.execute(sql, (dataid, strainid, value))
            if se is not None:
                sql = """
                    INSERT INTO PublishSE
                    SET
                    PublishSE.`DataId`=%s,
                    PublishSE.`StrainId`=%s,
                    PublishSE.`error`=%s
                    """
                cursor.execute(sql, (dataid, strainid, se))
            if n is not None:
                sql = """
                    INSERT INTO NStrain
                    SET
                    NStrain.`DataId`=%s,
                    NStrain.`StrainId`=%s,
                    NStrain.`count`=%s
                    """
                cursor.execute(sql, (dataid, strainid, n))
        # PublishXRef
        sql = """
            INSERT INTO PublishXRef
            SET
            PublishXRef.`InbredSetId`=%s,
            PublishXRef.`PhenotypeId`=%s,
            PublishXRef.`PublicationId`=%s,
            PublishXRef.`DataId`=%s,
            PublishXRef.`comments`=%s
            """
        cursor.execute(sql, (inbredsetid, phenotypeid, publicationid, dataid, ""))
        rowcount = cursor.rowcount
        publishxrefid = con.insert_id()
        print "INSERT INTO PublishXRef: %d record: %d" % (rowcount, publishxrefid)
        # for loop next
        dataid += 1
        print
    # release
    con.close()