Example #1
import pandas as pd
from pybiomart import Dataset


def get_geneset(filename):
    indices = [pos for pos, char in enumerate(filename) if char == '.']
    outfile = '/Volumes/My Book/AHS_projectdata/geneSets2/' + filename[:indices[
        -2]] + '.geneset.csv'
    print('running ', filename[:indices[1]])
    genes = pd.read_csv(filename, sep='\t')
    if genes.shape[0] == 0:
        genes.to_csv(outfile, sep='\t')
        return
    dataset = Dataset(name='hsapiens_gene_ensembl', host='grch37.ensembl.org')
    df = pd.concat([
        dataset.query(
            attributes=['ensembl_gene_id', 'hgnc_symbol'],
            filters={
                'link_ensembl_gene_id':
                genes.gene_ID.tolist()[:int(len(genes.gene_ID.tolist()) / 2)]
            }),
        dataset.query(
            attributes=['ensembl_gene_id', 'hgnc_symbol'],
            filters={
                'link_ensembl_gene_id':
                genes.gene_ID.tolist()[int(len(genes.gene_ID.tolist()) / 2):]
            })
    ],
                   sort=False)
    df.drop_duplicates(subset=["HGNC symbol"], inplace=True)
    df.dropna(inplace=True)
    my_genes = df["HGNC symbol"]
    my_genes.to_csv(outfile, index=False, header=False)
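
The two half-list queries above work around BioMart's limit on how large a
single filter request can be. A more general sketch of the same idea (the
helper name and chunk_size are my own, not part of the original):

def query_in_chunks(dataset, gene_ids, chunk_size=200):
    # query BioMart in fixed-size chunks and combine the partial results
    frames = []
    for i in range(0, len(gene_ids), chunk_size):
        frames.append(
            dataset.query(attributes=['ensembl_gene_id', 'hgnc_symbol'],
                          filters={'link_ensembl_gene_id':
                                   gene_ids[i:i + chunk_size]}))
    return pd.concat(frames, sort=False).drop_duplicates()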
Example #2
from pybiomart import Dataset


def get_ensembl_table():

    dataset = Dataset(name='hsapiens_gene_ensembl',
                      host='http://www.ensembl.org')

    # note: the 'unigene' attribute may no longer be available in current
    # Ensembl marts, since NCBI retired UniGene
    table = dataset.query(
        attributes=['ensembl_gene_id', 'external_gene_name', 'unigene'])
    return table
Example #3
    def test_ensembl(self):
        """Tests example query to ensembl."""

        dataset = Dataset(name='hsapiens_gene_ensembl',
                          host='http://www.ensembl.org',
                          use_cache=False)

        result = dataset.query(
            attributes=['ensembl_gene_id', 'external_gene_name'])

        assert result.shape[0] > 0
        assert result.shape[1] == 2
Example #4
import pandas as pd
from pybiomart import Dataset
from tqdm import tqdm


def gene_length_normalize(*,
                          genes_info,
                          genes_col='HGNC symbol',
                          length_col='gene_length',
                          scores_df,
                          samples_col):
    """
    Normalize dataset by gene length. if gene lengths file is not provided, info will be retrieved from ensembl.

    Parameters
    ----------
    genes_info : str
        file containing gene lengths. If file is not provided, info will be retrieved from ensembl
    genes_col : str
        column containing gene names.
    length_col : str
        column containing the length of each gene.
    scores_df : pd.DataFrame
        dataframe containing data to normalize.
    samples_col : str
        column containing samples IDs.

    Returns
    -------
    pd.Dataframe
        dataframe containing normalized dataframe.

    """
    unnormalized = []
    if not genes_info:
        dataset = Dataset(name='hsapiens_gene_ensembl',
                          host='http://www.ensembl.org')
        genes_df = dataset.query(
            attributes=['hgnc_symbol', 'start_position', 'end_position'])
        genes_df['gene_length'] = genes_df['Gene end (bp)'] - genes_df[
            'Gene start (bp)']
    else:
        genes_df = pd.read_csv(genes_info, sep='\t')
    genes_lengths = genes_df.set_index(genes_col).to_dict()[length_col]
    for (name, data) in tqdm(scores_df.drop(columns=[samples_col]).items(),
                             desc="Normalizing genes scores"):
        if name not in genes_lengths.keys():
            unnormalized.append(name)
            continue
        # normalize genes by length
        scores_df[name] = round(scores_df[name] / genes_lengths[name], 5)
    scores_df = scores_df.drop(unnormalized, axis=1)
    return scores_df
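
A minimal usage sketch; the file name, gene names, and scores below are
illustrative only, and the gene-lengths file is assumed to be tab-separated
with 'HGNC symbol' and 'gene_length' columns:

scores = pd.DataFrame({
    'sample': ['s1', 's2'],
    'TP53': [10.0, 20.0],
    'BRCA1': [30.0, 40.0],
})
normalized = gene_length_normalize(genes_info='gene_lengths.tsv',
                                   scores_df=scores,
                                   samples_col='sample')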
Example #5
def get_homology_lookup() -> pd.DataFrame:
    """
    Returns lookup table consisting of ensembl id of reference species (C.elegans)
        and ensembl id, gene symbol, orthology type and orthology confidence
        of the other species (D.melanogaster).
    """
    
    dataset = Dataset(name=CELEGANS_DATASET_NAME, host=HOST)
    
    attributes = [ENSEMBL_ID_ATTRIBUTE] + DROSO_HOMO_ATTRIBUTES
    df_lookup = dataset.query(attributes=attributes, filters=None)

    df_lookup.to_csv(LOOKUP_FILENAME, header=True, index=True)
    
    return df_lookup
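
The module-level constants are not shown in this snippet. Plausible values,
following Ensembl BioMart's naming conventions (these are guesses, not the
original definitions):

HOST = 'http://www.ensembl.org'
CELEGANS_DATASET_NAME = 'celegans_gene_ensembl'
ENSEMBL_ID_ATTRIBUTE = 'ensembl_gene_id'
DROSO_HOMO_ATTRIBUTES = [
    'dmelanogaster_homolog_ensembl_gene',
    'dmelanogaster_homolog_associated_gene_name',
    'dmelanogaster_homolog_orthology_type',
    'dmelanogaster_homolog_orthology_confidence',
]
LOOKUP_FILENAME = 'celegans_dmelanogaster_homology_lookup.csv'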
Example #6
import re

import ensembl_rest
import pandas as pd
from pybiomart import Dataset


def get_ref_proteins(gene_list):
    """Get wild type protein sequences for each gene with an alternative junction

    Args:
        gene_list (list): list of gene symbols

    Returns:
        final_gene_df (df): reference protein df with sequence, length, and ID
    """
    dataset = Dataset(name='hsapiens_gene_ensembl',
                      host='http://www.ensembl.org')
    total_gene_df = pd.DataFrame()
    gene_info = dataset.query(attributes=[
        "external_gene_name", "ensembl_gene_id", "ensembl_transcript_id",
        "ensembl_peptide_id", "chromosome_name", "start_position",
        "end_position", "strand", "transcript_start", "transcript_end",
        "transcription_start_site", "transcript_length", "transcript_tsl",
        "transcript_biotype"
    ])
    for gene in gene_list:
        gene_df = gene_info.loc[gene_info["Gene name"] == gene]
        # filter out NaN values
        gene_df = gene_df.loc[gene_df["Transcript support level (TSL)"].astype(
            str).str.contains("tsl")]
        gene_df["tsl"] = [
            re.search(r'\d', x).group()
            for x in list(gene_df["Transcript support level (TSL)"])
        ]
        # filter by protein coding and TSL == 1,2
        gene_df = gene_df[(gene_df["Transcript type"] == "protein_coding")
                          & (gene_df["tsl"].isin(["1", "2"]))]
        gene_df["protein sequence"] = [
            ensembl_rest.sequence_id(x)["seq"]
            for x in list(gene_df["Protein stable ID"])
        ]
        gene_df["protein length"] = [
            len(x) for x in list(gene_df["protein sequence"])
        ]
        final_gene_df = gene_df[[
            "Protein stable ID", "protein sequence", "protein length"
        ]].copy()
        final_gene_df["gene"] = gene
        # DataFrame.append was removed in pandas 2.0; use concat instead
        total_gene_df = pd.concat([total_gene_df, final_gene_df],
                                  ignore_index=True)
    total_gene_df.to_csv("protein_sequences.tsv", sep='\t', index=False)
    return total_gene_df
Example #7
def get_species_ens_entrez_lookup(dataset_name: str) -> pd.DataFrame:
    """
    Returns lookup table for a ensembl dataset name with 2 columns:
        ensembl id, entrez id.
    """
    
    dataset = Dataset(name=dataset_name,
                     host=HOST)
    
    df_lookup = dataset.query(attributes=[
        ENSEMBL_ID_ATTRIBUTE,
        ENTREZ_ID_ATTRIBUTE],
        filters=None)
    
    df_lookup.to_csv(RESULTS_DIR / ("{}_ENS_ENTREZ_LOOKUP_.csv"
                     .format(dataset_name.split("_")[0].upper())),
                     header=True, index=True)
    
    return df_lookup
Example #8
from pybiomart import Dataset, Server


def get_biomart(species, meta):
    tmp_host = 'http://asia.ensembl.org'
    server = Server(host=tmp_host)
    query_set = None
    try:
        dataset = Dataset(name=species, host=tmp_host)
        if meta:
            query_set = dataset.query(attributes=[
                'ensembl_gene_id', 'external_gene_name', 'description',
                'uniprotswissprot', 'kegg_enzyme', 'metacyc'
            ])
        else:
            query_set = dataset.query(attributes=[
                'ensembl_gene_id', 'external_gene_name', 'description',
                'uniprotswissprot', 'kegg_enzyme'
            ])
    except IndexError:
        mart = server['ENSEMBL_MART_ENSEMBL']
        print('Invalid dataset in BioMart')
        print(mart.list_datasets())
    return query_set
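
A usage sketch: the function returns None when the dataset name is invalid,
so callers should check before using the result (the output file name here is
illustrative):

annotations = get_biomart('hsapiens_gene_ensembl', meta=False)
if annotations is not None:
    annotations.to_csv('annotations.tsv', sep='\t', index=False)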
Example #9
def biomart(ani_list, ani_dict, out, mode):
    #server = Server(host='http://www.ensembl.org')
    #mart = server['ENSEMBL_MART_ENSEMBL']
    #all_name = mart.list_datasets()
    for animal in ani_list:
        dataset = Dataset(name=ani_dict[animal], host='http://www.ensembl.org')
        #dataset.list_filters
        #attr_all_list = dataset.attributes()
        if mode == "GO":
            print("Downloading "  + animal + " Gene information")
            if not os.path.exists(out + "/" + "GO"):
                os.mkdir(out + "/" + "GO")
            attr_list = ["ensembl_gene_id","external_gene_name","start_position","end_position","description","transcript_count","chromosome_name"]
            df = dataset.query(attributes= attr_list)
            df.to_csv(out + "/" + "GO" + "/" + animal + "_GO.txt", index = None, header = True)

        elif mode == "GOD":
            print("Downloading " + animal + " Gene Ontology")
            if not os.path.exists(out + "/" + "GOD"):
                os.mkdir(out + "/" + "GOD")
            attr_list = ["ensembl_gene_id","go_id","name_1006","definition_1006"]
            df = dataset.query(attributes= attr_list)
            df.to_csv(out + "/" + "GOD" + "/" + animal + "_GOD.txt",sep='\t', index = None, header = True)
        elif mode == "ORTH":
            print("Downloading " + animal + " Orthologs")
            if not os.path.exists(out + "/" + "ORTH"):
                os.mkdir(out + "/" + "ORTH")
            orth_list = list(ani_dict.keys())
            for o in orth_list:
                if not os.path.exists(out + "/" + "ORTH" + "/" + o):
                    os.mkdir(out + "/" + "ORTH" + "/" + o)
            orth_list.remove(animal)
            sp_list = list()
            for key in orth_list:
                sp_name = ani_dict[key].split("_")[0]
                sp_list.append(sp_name)
            for sp in sp_list:
                attr_list = ["ensembl_gene_id", "external_gene_name",
                             sp + "_homolog_ensembl_gene",
                             sp + "_homolog_associated_gene_name",
                             sp + "_homolog_orthology_type"]
                df = dataset.query(attributes=attr_list)
                # map the dataset prefix back to its animal name via ani_dict
                f = list(ani_dict.keys())[list(
                    ani_dict.values()).index(sp + "_gene_ensembl")]
                print("Downloading orthologs between " + animal + " and " + f)
                df.to_csv(out + "/" + "ORTH" + "/" + animal + "/" + animal +
                          "_" + f + ".txt", index=None, header=True)
        elif mode == "PC":
            print("Downloading protein coding genes information for " + animal)
            if not os.path.exists(out + "/" + "PC"):
                os.mkdir(out + "/" + "PC")
            attr_list = ["ensembl_gene_id","external_gene_name","go_id"]
            filter_list = {"biotype": ["protein_coding"]}
            df = dataset.query(attributes= attr_list, filters= filter_list)
            df.to_csv(out + "/" + "PC" + "/" + animal + ".txt", index = None, header = True)            
Example #10
def biomart(ani_list, ani_dict, out, mode):
    #server = Server(host='http://www.ensembl.org')
    #mart = server['ENSEMBL_MART_ENSEMBL']
    #all_name = mart.list_datasets()
    #attr_all_list = dataset.attributes()
    if mode == "Bos_Chromosome_18":
        dataset = Dataset(name="btaurus_gene_ensembl",
                          host='http://www.ensembl.org')
        print("Downloading " + mode + " Gene information")
        if not os.path.exists(path + "temp_Data/" + "Bos_Chromosome_18/"):
            os.mkdir(path + "temp_Data/" + "Bos_Chromosome_18/")
        attr_list = ["ensembl_gene_id"]
        filter_list = {'chromosome_name': ['18']}
        df = dataset.query(attributes=attr_list, filters=filter_list)
        df.to_csv(path + "temp_Data/" + "Bos_Chromosome_18/" + "Cow_C_18.txt",
                  index=None,
                  header=True)
    if mode == "sex":
        for ani in ani_list:
            dataset = Dataset(name=ani_dict[ani],
                              host='http://www.ensembl.org')
            print("Downloading " + ani + mode + " Gene information")
            if not os.path.exists(path + "temp_Data/" + "sex/"):
                os.mkdir(path + "temp_Data/" + "sex/")
            attr_list = ["ensembl_gene_id"]
            if ani == "Chicken":
                filter_list = {'chromosome_name': ['W', "Z"]}
                df = dataset.query(attributes=attr_list, filters=filter_list)
                df.to_csv(path + "temp_Data/" + "sex/" + ani + "_sex.txt",
                          index=None,
                          header=True)
                print(ani + "W Z")
            else:
                try:
                    filter_list = {'chromosome_name': ["X"]}
                    df = dataset.query(attributes=attr_list,
                                       filters=filter_list)
                    df.to_csv(path + "temp_Data/" + "sex/" + ani + "_sex.txt",
                              index=None,
                              header=True)
                    print(ani + "X Y")
                except:
                    filter_list = {'chromosome_name': ["X"]}
                    df = dataset.query(attributes=attr_list,
                                       filters=filter_list)
                    df.to_csv(path + "temp_Data/" + "sex/" + ani + "_sex.txt",
                              index=None,
                              header=True)
                    print(ani + "X")
    if mode == "MT":
        for ani in ani_list:
            dataset = Dataset(name=ani_dict[ani],
                              host='http://www.ensembl.org')
            print("Downloading " + ani + mode + " Gene information")
            if not os.path.exists(path + "temp_Data/" + "MT/"):
                os.mkdir(path + "temp_Data/" + "MT/")
            attr_list = ["ensembl_gene_id"]
            try:
                filter_list = {'chromosome_name': ["MT"]}
                df = dataset.query(attributes=attr_list, filters=filter_list)
                df.to_csv(path + "temp_Data/" + "MT/" + ani + "_MT.txt",
                          index=None,
                          header=True)
            except Exception:
                print("No mitochondrial genes in {}".format(ani))
Example #11
outF = open(evalign_stat, "w")
print("Position\tIsoform\tGene_ID\tReads\tEvent_mean\tEvent_median\tSD\tDistance",
      file=outF)
for k, v in data_dict.items():
    gene_id, isoform, position = k.split('_')[0], k.split('_')[1], k.split('_')[2]
    print(position, isoform, gene_id, len(v), np.mean(v), np.median(v),
          np.std(v), np.mean(v) - 123.83, sep='\t', file=outF)

evalign_file.close()
isoform_file.close()
outF.close()
outF2.close()

dataset = Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')

conversion = dataset.query(
    attributes=['ensembl_gene_id', 'external_gene_name'])
conversion.columns = ['Gene_ID', 'Gene_symbol']

for file in [evalign_stat, evalign_df]:
    df = pd.read_csv(file, sep='\t')
    df_merge = pd.merge(df, conversion, how='inner', on=['Gene_ID'])
    df_merge.to_csv(file, header=True, index=False, sep='\t')
Example #12
def handle_upload_2(fn):
    patients = []

    patients1 = []
    patients2 = []
    genes = []
    geneNames = []
    #data = {}
    data1 = {}
    data2 = {}
    group1 = []
    group2 = []
    group_labels1 = []
    group_labels2 = []
    group1_data = []
    group2_data = []
    #patient_ids = ['3dbe99d1-e3b8-4ee2-b6a8-2e2e12c6fbe9','6F4C8D30-47FB-47DF-9EB7-4E5881E3711E','95CEF916-5545-455B-920C-773A54FC7676','67C73260-A242-4BBA-87C5-D2302556DFF7','55262FCB-1B01-4480-B322-36570430C917','3dbe99d1-e3b8-4ee2-b6a8-2e2e12c6fbe9','6F4C8D30-47FB-47DF-9EB7-4E5881E3711E','95CEF916-5545-455B-920C-773A54FC7676','67C73260-A242-4BBA-87C5-D2302556DFF7','55262FCB-1B01-4480-B322-36570430C917']
    #patientfilename = 'nationwidechildrens.org_clinical_patient_brca.txt'

    patients.append([1, 2, 3, 4])
    patients1.append(['1', '2'])
    #patients.append(['3','4'])
    patients2.append(['3', '4'])
    group_labels1.append([1, 1])
    group_labels2.append([2, 2])
    logstring = "Creating Plots for given input files... \n\n"
    logstring = "Reading gene expression data... \n"
    line_no = 0
    patient_ids = []
    survival = []
    survival_yrs = []
    data = []
    #group2.append(group_labels2)
    group1.append([1, 1])
    group2.append([2, 2])
    genes = ()
    dataset = Dataset(name='hsapiens_gene_ensembl',
                      host='http://www.ensembl.org')
    conv = dataset.query(attributes=['ensembl_gene_id', 'external_gene_name'])
    genes_3 = {}
    logstring = logstring + "\n\n Matching gene and protein IDs... \n"

    red_patch = mpatches.Patch(color='red', label='Group1')
    blue_patch = mpatches.Patch(color='blue', label='Group2')
    #lut = dict(zip(set(endData[0]), sns.hls_palette(len(set(endData[0])), l=0.5, s=0.8)))
    #col_colors = pd.DataFrame(endData[0])[0].map(lut)
    #print(col_colors)
    #colors = np.array(['#BB0000','#BB0000','#0000BB','#0000BB'])
    #df9 = pd.DataFrame(data=endData[1:,0:],index=geneNames,columns=patients)
    #df2 = pd.DataFrame(data=endData[0,0:], index='',columns=patients)
    #my_palette = dict(zip(df[.unique(), ["orange","yellow","brown"]))
    #row_colors = df2.cyl.map(my_palette)
    #fig, (ax1, ax2) = plt.subplots(1,2,sharex=True,sharey=True)
    colordict = {0: '#BB0000', 1: '#0000BB'}
    #logstring = logstring + str(df9)
    df2 = pd.read_csv(fn, delim_whitespace=True, header=None, index_col=0)
    #print(df2.head())
    df = df2.transpose()
    survival_real = df['SURVIVAL']
    #df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75
    #train, test = df[df['is_train']==True], df[df['is_train']==False]
    features = df.columns[3:60483]
    x = df.iloc[:, 5:500]
    y = df.iloc[:, 2]
    print(df)
    rm = linear_model.LinearRegression()
    rm.fit(x, y)
    #print(rm.intercept_)
    #print(rm.coef_)
    #print(rm.predict(x))
    predictions = rm.predict(x)
    real_values = df.iloc[:, 2].values.tolist()
    ret = []
    for j in range(1, len(survival_real)):
        accur = "FALSE"
        if (abs(float(predictions[j]) - float(real_values[j])) < 1.0):
            print("Foo")
            accur = "TRUE"
        ret.append({
            'patient_id': j,
            'real_value': real_values[j],
            'prediction': predictions[j],
            'was_correct': accur
        })
    return (ret)
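
A hedged call sketch; the input format is inferred from the read_csv call
above (whitespace-separated, first column used as the index, one row named
SURVIVAL), and the file name is hypothetical:

results = handle_upload_2('expression_matrix.txt')
for r in results[:3]:
    print(r['patient_id'], r['prediction'], r['was_correct'])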
Example #13
matchesThaliana = {}

for i, row in dfSalidaThaliana.iterrows():
    genSnapdragon = row['query acc.ver']
    genThaliana = row['subject acc.ver']
    evalue = row['evalue']

    matchesThaliana[genSnapdragon] = (genThaliana, evalue)

# Now we extract the annotations and store them in a map

# Tomato
anotacionesTomato = {}

datasetTomato = Dataset(name='slycopersicum_eg_gene',
                        virtual_schema='plants_mart',
                        host='http://plants.ensembl.org')

# For each gene, we issue a BioMart query
# (matchesTomato is presumably built from the tomato BLAST output,
# analogous to matchesThaliana above)
for genSnapdragon in matchesTomato:
    print('tomato')
    genTomato, evalue = matchesTomato[genSnapdragon]
    resultTomato = datasetTomato.query(
        attributes=[
            'ensembl_gene_id', 'ensembl_transcript_id', 'go_id',
            'go_linkage_type', 'namespace_1003'
        ],
        filters={'link_ensembl_transcript_stable_id': genTomato})

    # If a match is found:
    if len(resultTomato) > 0:
Example #14
# anchor = 'A'
# chrom = 'chr4'
# ex 2: DSG3 D + INDEL (1 aa deletion) #
transcript_id = 'ENST00000257189'
junction_coors = [31472788, 31474124]
anchor = 'D'
chrom = 'chr18'
# ex 3: RAB18 NDA + FS #
# transcript_id = 'ENST00000356940'
# [stop_exon, start_exon]
# junction_coors = [27509930,27532507]
# anchor = 'NDA'
# chrom = 'chr10'

# load ensembl dataset with pybiomart
dataset = Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')
# use fasta to get sequences
ref_fasta = Fasta('/Users/meganrichters/Documents/ref_files/all_sequences.fa')


def get_coding_coordinates(dataset, transcript_id, anchor):
    # get ref info and drop NaN values - ex: exons that are not part of the coding sequence
    ref_tscript_info = dataset.query(attributes=[
        "ensembl_transcript_id", "strand", "transcript_start",
        "transcript_end", "exon_chrom_start", "exon_chrom_end",
        "genomic_coding_start", "genomic_coding_end"
    ],
                                     filters={
                                         'link_ensembl_transcript_stable_id':
                                         [transcript_id]
                                     }).dropna()
Example #15
        return x
    elif not pd.isna(x):
        return x
    else:
        return y


if __name__ == '__main__':

    #
    # [H]omo [S]apiens (9606) - [A]liases
    #

    print('Mapping HS')
    # Query bioMart for Gene Name/Description
    ds_HS = Dataset(name='hsapiens_gene_ensembl',
                    host='http://www.ensembl.org')
    df_HS_G = ds_HS.query(attributes=[
        'ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'description'
    ]).set_index('Gene stable ID')

    rCSVFileCG = "../01-diff-gene-exp/results/HS/HS-DGE_Cyte_vs_Gonia.csv"
    rCSVFileCT = "../01-diff-gene-exp/results/HS/HS-DGE_Tid_vs_Cyte.csv"
    df_HS_CG = pd.read_csv(rCSVFileCG,
                           index_col=0).loc[:, ['logFC', 'logCPM', 'FDR']]
    df_HS_CG.index.name = 'id_gene'
    df_HS_CG.index = df_HS_CG.index.map(lambda x: x.split('.')[0])
    df_HS_CG.columns = [x + '_CyteGonia' for x in df_HS_CG.columns]
    df_HS_CT = pd.read_csv(rCSVFileCT,
                           index_col=0).loc[:, ['logFC', 'logCPM', 'FDR']]
    df_HS_CT.columns = [x + '_TidCyte' for x in df_HS_CT.columns]
    df_HS_CT.index.name = 'id_gene'
Example #16
def import_ndex(network_id, force_update=False):
    """
    Download and process the PPI network directly from ndexbio.org
    :param network_id: String, UUID of the network to download
    :param force_update: Boolean, if true the cached version will be ignored and updated
    :return: String, one line per interaction, separated by tabs
    """
    ndex_server = 'public.ndexbio.org'

    # --- Check if we can use a cached version
    # Connect to NDEx server anonymously, download metadata and get modification time
    network_metadata = ndex2.client.Ndex2(ndex_server) \
        .get_network_summary(network_id)
    network_modification_time = datetime.fromtimestamp(
        network_metadata['modificationTime'] / 1000.0, tz=timezone.utc)

    # Try and retrieve a cached version. Check if the modification date is within spec, return the cached network
    if not force_update:
        try:
            ppi_network_cache = PpiNetworkCache.objects.get(
                network_id=network_id)
            datetime_now = timezone.now()
            # The network data modification date must be the same as the one just retrieved,
            # the network cache must have been created within the last 24h
            if ppi_network_cache.data_last_modified == network_modification_time and \
                    datetime_now - timedelta(hours=24) <= ppi_network_cache.last_modified:
                print(
                    f'Network cached on {ppi_network_cache.last_modified.isoformat()}'
                )
                return ppi_network_cache.network_string
        except PpiNetworkCache.DoesNotExist:
            # Download and generate network if no cache exists
            pass

    # Import NDEx from server based on UUID
    nice_cx_network = ndex2.create_nice_cx_from_server(server=ndex_server,
                                                       uuid=network_id)

    # --- Create a node_id to gene_id dict which maps from the node_id to the gene_id
    node_to_gene_df = pd.DataFrame([x[1] for x in nice_cx_network.get_nodes()]) \
        .rename({'@id': 'Node ID', 'n': 'Gene name'}, axis='columns')

    # If we are using APID, then we need to use another attribute
    if network_id == '9c38ce6e-c564-11e8-aaa6-0ac135e8bacf':
        node_to_gene_df['Gene name'] = node_to_gene_df['Node ID'].map(
            lambda x: nice_cx_network.get_node_attribute_value(
                x, 'GeneName_A'))

    # --- Create gene_id to other_id dict which maps from gene_id to other ID e.g NCBI IDs
    query_attributes = ['external_gene_name', 'entrezgene_id']
    gene_mapping_df = Dataset(name='hsapiens_gene_ensembl',
                              host='http://www.ensembl.org').query(
                                  attributes=query_attributes).dropna()
    gene_mapping_df.columns = query_attributes
    # set the Gene name (the one used in the networks as ID). Then convert
    # the entrez IDs into int and then to string
    gene_mapping_df = gene_mapping_df \
        .drop_duplicates(subset=['external_gene_name'], keep='first') \
        .set_index('external_gene_name') \
        .astype(int).astype(str)
    # Create the mapping dict
    gene_mapping_dict = gene_mapping_df.to_dict()[
        'entrezgene_id']  # Get the entrez IDs

    # --- Apply gene mapping from gene name to NCBI IDs to the note_to_gene_df and drop missing values
    node_to_gene_df['Gene name'] = node_to_gene_df['Gene name'].map(
        gene_mapping_dict)
    node_to_gene_dict = node_to_gene_df \
        .set_index('Node ID') \
        .dropna() \
        .to_dict()['Gene name']

    # --- Create the network PPI file
    # Iterate over all edges
    result_list = []
    for _, edge in nice_cx_network.get_edges():
        edge_source = edge.get('s')
        edge_target = edge.get('t')
        if edge_source != edge_target:
            # Convert source and target to NCBI IDs and write into string
            try:
                result_list.append(node_to_gene_dict[edge_source] + '\t' +
                                   node_to_gene_dict[edge_target])
            except KeyError:
                # If no mapping can be found, skip this node
                continue

    # --- Save version to cache (db) and return result network string
    result_string = '\n'.join(result_list)
    PpiNetworkCache.objects.update_or_create(network_id=network_id,
                                             defaults={
                                                 'data_last_modified':
                                                 network_modification_time,
                                                 'network_string':
                                                 result_string
                                             })
    return result_string
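
A usage sketch, reusing the APID network UUID that the function above
special-cases:

ppi = import_ndex('9c38ce6e-c564-11e8-aaa6-0ac135e8bacf')
print(ppi.splitlines()[:5])  # first few tab-separated interaction pairs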
Example #17
def read_ndex_file_4(fn):
    """
    Given an input string/file, parse the network and return two-column array with interaction partners
    :param fn: Imput NDEx file as string
    :return: Printed as strings, two-column array with interaction partners
    """
    lines6 = ""
    # read edges and nodes into arrays
    if ("edges" in fn.split("nodes")[1]):
        lines5 = fn.split("{\"nodes\":[")
        lines3 = lines5[1].split("{\"edges\":[")[0]
        # remove "cyTableColumn" from array containing edges
        if ("cyTableColumn" in lines5[1]):
            lines4 = lines5[1].split("{\"edges\":[")[1].split(
                "{\"cyTableColumn\":[")[0]
            lines4 = lines4[:-4]
        # take protein name from networkAttributes or nodeAttributes if it is defined there.
        elif ("networkAttributes" in lines5[1]):
            lines4 = lines5[1].split("{\"edges\":[")[1].split(
                "{\"networkAttributes\":[")[0]
            lines4 = lines4[:-4]
            if ("nodeAttributes" in lines5[1].split("{\"edges\":[")[1].split(
                    "{\"networkAttributes\":[")[1]
                    and "UniprotName" in lines5[1].split("{\"edges\":[")
                [1].split("{\"networkAttributes\":[")[1]):
                lines6_temp = \
                    lines5[1].split("{\"edges\":[")[1].split("{\"networkAttributes\":[")[1].split(
                        "{\"nodeAttributes\":[")[
                        1]
                lines6 = lines6_temp.split("{\"edgeAttributes\":[")[0]
        else:
            lines4 = lines5[1].split("{\"edges\":[")[1]
    # check if edge-array comes before node-array in file
    elif ("edges" in fn.split("nodes")[0]):
        lines5 = fn.split("{\"nodes\":[")
        lines3 = lines5[1].split("]},")[0] + "]]]"
        lines4 = lines5[0].split("{\"edges\":[")[1][:-4]
    # lines3 contains the nodes, lines4 the edges, lines6 contains nodeAttributes (information from the ndex file usable for the conversion from node IDs to gene IDs)
    # strip trailing brackets and the ncbigene: prefix so the node list
    # parses as JSON
    lines33 = lines3[:-3].replace("}]", "")
    node_line = lines33.replace("ncbigene:", "")
    dictlist = []
    # node dict is later filled with keys (node IDs) and the values are NCBI gene IDs
    node_dict = {}
    if not (node_line.endswith("}")):
        node_line = node_line + "}"
    node_line_2 = "[" + node_line + "]"
    tmp2 = json.loads(node_line_2)
    node_dict_2 = {}
    # iterate over lines in nodeAttributes
    if not (lines6 == ""):
        lines6 = "[" + lines6
        # get array with nodeAttributes for current line
        tmp4 = json.loads(lines6[:-4])
        # if node element has attribute "GeneName_A", then the NCBI ID is given in the nodeAttributes
        for item in tmp4:
            if (item['n'] == "GeneName_A"):
                # use node ID and NCBI ID
                node_dict_2[item['po']] = item['v']
    # print(str(item['po']) + " " + str(item['v']))
    # print(node_dict_2)
    dataset = Dataset(name='hsapiens_gene_ensembl',
                      host='http://www.ensembl.org')
    conv = dataset.query(
        attributes=['ensembl_gene_id', 'external_gene_name', 'entrezgene_id'])
    conv_genelist = conv['Gene name'].tolist()
    for item in tmp2:
        dictlist.append(item)
        # write conversion from node ID to gene ID in dictionary, based on nodeAttributes from the data
        if ('r' in item):
            if (any(c.islower() for c in item['r'])):
                gene_name = item['n']
                if (gene_name in conv_genelist):
                    gene_nbr = conv.index[conv['Gene name'] == gene_name]
                    gene_nbr1 = conv.loc[gene_nbr, 'NCBI gene ID'].values[0]
                    node_dict[item['@id']] = gene_nbr1
            # print(item)
            else:
                node_dict[item['@id']] = item['r']
        # print(item)
        else:
            if (item['n'].isdigit()):
                # if gene ID is in node attributes
                # print(item)
                node_dict[item['@id']] = item['n']
            elif (item['n'] in node_dict_2):
                # otherwise use conversion table to convert gene ID to NCBI ID
                gene_name = node_dict_2[item['n']]
                # print(gene_name)
                if (gene_name in conv_genelist):
                    gene_nbr = conv.index[conv['Gene name'] == gene_name]
                    gene_nbr1 = conv.loc[gene_nbr, 'NCBI gene ID'].values[0]
                    node_dict[item['@id']] = gene_nbr1
        # print(gene_nbr1)
    # print(node_dict)
    # strip stray brackets so the edge list parses as JSON
    lines4 = lines4.replace("]", "")
    edge_line = lines4.rstrip()
    edge_line_2 = "[" + edge_line + "]"
    edgelist = []
    tmp4 = json.loads(edge_line_2)
    # reuse the gene-name / NCBI ID (entrezgene_id) conversion table (conv)
    # queried above
    ret = []
    # convert node IDs in edges to NCBI IDs
    for item in tmp4:
        # print(item)
        if (item['s'] in node_dict and item['t'] in node_dict):
            source = node_dict[item['s']]
            target = node_dict[item['t']]
            # print(source)
            # print(target)
            if (source != target and not (math.isnan(float(source)))
                    and not (math.isnan(float(target)))):
                baz = [str(int(source)), str(int(target))]
                ret.append("\t".join(baz))
    # print("\n".join(ret))
    return ("\n".join(ret))
Example #18
        #     continue
        # fasta.write( ">"+svid+"."+read.query_name+"\n")
        # fasta.write(read.seq+"\n")

    fasta.close()
    bamfile.close()


########################################   Main code   ########################################

print("Start:", datetime.datetime.now())

ensembl_rest_client = EnsemblRestClient()  # keep the instance name distinct from the class

### DOWNLOAD BASIC GENE INFORMATION FROM ENSEMBL (ID, CHROMOSOME, POSITION, STRAND, BIOTYPE)
dataset = Dataset(name='hsapiens_gene_ensembl',
                  host='http://grch37.ensembl.org')
ensembl_genes = dataset.query(attributes=[
    'ensembl_gene_id', 'chromosome_name', 'start_position', 'end_position',
    'strand', 'gene_biotype'
],
                              filters={
                                  'chromosome_name': [
                                      '1', '2', '3', '4', '5', '6', '7', '8',
                                      '9', '10', '11', '12', '13', '14', '15',
                                      '16', '17', '18', '19', '20', '21', '22',
                                      'X', 'Y', 'MT'
                                  ]
                              })
regions = []

for line in ensembl_genes.iterrows():
Example #19
        },
        'enterocyte': {
            'HS': enterocyte_pca_modules_hs,
            'MM': enterocyte_pca_modules_mm,
            'DM': enterocyte_pca_modules_dm,
        }
    }
    modules = data[celltype][layer]

    # for specie in species
    print('Calculating GOEA on {celltype:s} {network:s} {threshold:.1f} {layer:s}'.format(celltype=celltype, network=network, threshold=threshold, layer=layer))

    # Load Gene Population
    print("Load gene population (from biomart)")
    datamart_name = dict_datamart_names[layer]
    ds = Dataset(name=datamart_name, host='http://www.ensembl.org')
    if layer == 'DM':
        attributes = ['ensembl_gene_id', 'uniprotswissprot', 'external_gene_name']
    elif layer == 'MM':
        attributes = ['ensembl_gene_id', 'uniprotswissprot', 'mgi_id', 'external_gene_name']
    elif layer == 'HS':
        attributes = ['ensembl_gene_id', 'uniprotswissprot', 'hmmpanther', 'external_gene_name']
    dfQ = ds.query(attributes=attributes).set_index('Gene stable ID')

    # Population of genes (background) to test against
    if layer == 'DM':
        pop_flybase = set(dfQ.index.tolist())
        pop_uniprot = set(dfQ['UniProtKB/Swiss-Prot ID'].dropna().tolist())
        pop = pop_flybase.union(pop_uniprot)
    elif layer == 'MM':
        pop_mgi = set(dfQ['MGI ID'].dropna().tolist())
Example #20
from pybiomart import Server
server = Server(host='http://www.ensembl.org')
server.list_marts()
mart = server['ENSEMBL_MART_ENSEMBL']

from pybiomart import Dataset
dataset = Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')

dataset.query(attributes=['ensembl_gene_id', 'external_gene_name'],
              filters={'chromosome_name': ['1', '2']})


# pybiomart implements Dataset.attributes as a lazily-fetched property;
# its body looks roughly like this (the @property decorator is omitted here):
def attributes(self):

    if self._attributes is None:
        self._filters, self._attributes = self._fetch_configuration()
    return self._attributes


dataset.attributes
dataset.list_attributes()


# the matching filters property:
def filters(self):
    if self._filters is None:
        self._filters, self._attributes = self._fetch_configuration()
    return self._filters


dataset.filters
dataset.list_filters()
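
To find the internal name of an attribute or filter, the frames returned by
list_attributes() / list_filters() can be searched; a small sketch, assuming
they expose a 'name' column as in current pybiomart:

attrs = dataset.list_attributes()
print(attrs[attrs['name'].str.contains('hgnc', case=False)])

filts = dataset.list_filters()
print(filts[filts['name'].str.contains('chromosome', case=False)])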
Example #21
    #
    datamart_names = {
        'HS': 'hsapiens_gene_ensembl',
        'MM': 'mmusculus_gene_ensembl',
        'DM': 'dmelanogaster_gene_ensembl'
    }

    r = []
    for specie in species:

        print("Calculating for species: {specie:s}".format(specie=specie))

        print("Querying Datamart")
        datamart_name = datamart_names[specie]
        ds = Dataset(name=datamart_name, host='http://www.ensembl.org')
        dfQ = ds.query(attributes=[
            'ensembl_gene_id', 'external_gene_name', 'gene_biotype'
        ]).set_index('Gene stable ID')
        #
        n_genome = len(dfQ)
        dfQpc = dfQ.loc[(dfQ['Gene type'] == 'protein_coding'), :]
        n_genome_pc = len(dfQpc)
        dfQnpc = dfQ.loc[(dfQ['Gene type'] != 'protein_coding'), :]
        n_genome_non_pc = len(dfQnpc)
        print('done.')

        for celltype in celltypes:

            print("Calculating for celltype: {celltype:s}".format(
                celltype=celltype))