Example #1
def split_gtf(gtf, outdir, novel=False):
    gtf_df = gtfparse.read_gtf(gtf)
    if 'gene_type' in gtf_df.columns:
        gtf_df.loc[:, 'gene_biotype'] = gtf_df.gene_type
        gtf_df.drop('gene_type', axis=1, inplace=True)
    elif 'gene_biotype' in gtf_df.columns:
        pass
    else:
        gtf_df.loc[:, 'gene_biotype'] = 'protein_coding'

    type_label = 'gene_biotype'

    if novel:
        gtf_df.loc[:, type_label] = gtf_df.loc[:, type_label].map(
            GENCODE_CATEGORY_MAP)
    else:
        gtf_df.loc[:, type_label] = gtf_df.loc[:, type_label].map(
            simplify_gene_type)

    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    for gt, grp in gtf_df.groupby(type_label):
        gt_file = outdir / f'{gt}.gtf'
        with open(gt_file, 'w') as gt_inf:
            for idx in grp.index:
                outline = dfline2gtfline(grp.loc[idx])
                gt_inf.write(outline)
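The dfline2gtfline helper used above is not shown. A minimal sketch, assuming each row carries the standard GTF columns plus gene_id/gene_biotype attributes to rebuild the attribute field:

def dfline2gtfline(row):
    # hypothetical sketch; the real helper may serialize more attributes
    attrs = ' '.join(f'{key} "{row[key]}";'
                     for key in ('gene_id', 'gene_biotype')
                     if key in row.index and row[key] != '')
    fields = [str(row[col]) for col in ('seqname', 'source', 'feature',
                                        'start', 'end', 'score',
                                        'strand', 'frame')]
    return '\t'.join(fields + [attrs]) + '\n'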
Example #2
def ensp_to_hugo_map(datastore="./data"):
    """
    You should download the file Homo_sapiens.GRCh38.95.gtf from :
    ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz

    Store the file in datastore
    """
    savefile = datastore + "/datastore/ensp_ensg_df.pkl"

    # If the dataframe is already cached, load it; otherwise build and cache it
    if os.path.isfile(savefile):
        with open(savefile, 'rb') as f:
            df = pickle.load(f)
    else:
        df = read_gtf(datastore + "/datastore/Homo_sapiens.GRCh38.95.gtf")
        df = df[df['protein_id'] != ''][['gene_id',
                                         'protein_id']].drop_duplicates()
        df.to_pickle(savefile)

    # ENSG to hugo map
    with open(datastore + "/datastore/ensembl_map.txt") as csv_file:
        next(csv_file)  # Skip first line
        csv_reader = csv.reader(csv_file, delimiter='\t')
        ensg_map = {row[1]: row[0] for row in csv_reader if row[0] != ""}

    # ENSP to hugo map
    ensmap = {}
    for index, row in df.iterrows():
        if row['gene_id'] in ensg_map.keys():
            ensmap[row['protein_id']] = ensg_map[row['gene_id']]

    return ensmap
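A usage sketch for the function above; the ENSP accession is only an illustration and assumes the files described in the docstring are in place under ./data/datastore/:

ensmap = ensp_to_hugo_map("./data")
print(ensmap.get("ENSP00000269305"))  # prints the HUGO symbol (e.g. TP53) if the ID is mapped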
Example #3
def process_gene_annot(fpath, outPath):
    """
    :param fpath: string representing path to file
    :param outPath: string representing path for output file
    :return output: df containing headers=[chr, gene_id, genename, start, end]

    Note:
    -----
    use gtfparse to load gtf file. https://github.com/openvax/gtfparse
    """
    # load data
    geneDf = read_gtf(fpath)

    # retrieve genes only
    df_genes = geneDf[geneDf["feature"] == "gene"]

    # select wanted columns
    cols = ['seqname', 'gene_id', 'transcript_name', 'start', 'end']
    subdf_genes = df_genes[cols].copy()  # copy to avoid SettingWithCopyWarning

    # retrieve chr str
    chrStr = subdf_genes['seqname'].str.split('chr', n=1, expand=True)
    subdf_genes['chr'] = chrStr[1]

    # drop seqname and keep chr column
    sub_gene = subdf_genes.drop(['seqname'], axis=1)

    # reorder columns
    sub_gene = sub_gene[['chr', 'gene_id', 'transcript_name', 'start', 'end']]
    sub_gene.columns = ['chr', 'gene_id', 'genename', 'start', 'end']

    return sub_gene
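A hedged usage sketch; note that outPath is accepted but never written to in the body above, so only the returned DataFrame matters here (the file names are placeholders):

genes = process_gene_annot("gencode.v38.annotation.gtf", "genes.tsv")
print(genes.head())  # columns: chr, gene_id, genename, start, end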
Example #4
def find_bio_type(filePath):
    # open the file
    df = read_gtf(filePath)
    # select the gene_biotype column and convert the
    # DataFrame column to an array for manipulation
    source_array = df["gene_biotype"].__array__()
    return set(source_array)
Example #5
def main():
    parser = argparse.ArgumentParser(description="""

    python ExonUnion.py Calculate the union of the exons of a list
    of transcript.

    chr10   27035524        27150016        ABI1    76      -       NM_001178120    10006   protein-coding  abl-interactor 1        27037498        27149792        10      27035524,27040526,27047990,27054146,27057780,27059173,27060003,27065993,27112066,27149675,      27037674,27040712,27048164,27054247,27057921,27059274,27060018,27066170,27112234,27150016,
""")

    parser.add_argument("--gtf_file")
    # parser.add_argument("--transcript_bed")
    # parser.add_argument('-o', '--options', default='yo',
    # help="Some option", type='str')
    # parser.add_argument('-u', '--useless', action='store_true',
    # help='Another useless option')
    args = parser.parse_args()

    # returns GTF with essential columns such as "feature", "seqname", "start", "end"
    # alongside the names of any optional keys which appeared in the attribute column
    df = read_gtf(args.gtf_file)

    # filter the DataFrame to the entries of a single gene of interest
    #df_transcripts = df[df["feature"] == "transcript"]
    #df_transcripts = df_transcripts[df_transcripts['gene_type'] == 'protein_coding']
    #df_transcripts = df[df["transcript_name"] == "SAMD11-201"]
    #df_transcripts = df[df["gene_name"] == "AC114490.2-201"]
    df_transcripts = df[df["gene_id"] == "ENSG00000163867.17"]
    #df_transcripts = df.head()
    # gene_id = "ENST00000445297"
    # df_transcripts = df[df["transcript_id"].str.contains(gene_id)]
    #df_genes_chrY = df_genes[df_genes["seqname"] == "Y"]

    print(df_transcripts.to_string())
    print("--")
Example #6
def load_transcript_fpkm_dict_from_gtf(
        gtf_path,
        transcript_id_column_name="reference_id",
        fpkm_column_name="FPKM",
        feature_column_name="feature"):
    """
    Load a GTF file generated by StringTie which contains transcript-level
    quantification of abundance. Returns a dictionary mapping Ensembl
    IDs of transcripts to FPKM values.
    """
    df = gtfparse.read_gtf(gtf_path,
                           column_converters={fpkm_column_name: float})
    transcript_ids = _get_gtf_column(transcript_id_column_name, gtf_path, df)
    fpkm_values = _get_gtf_column(fpkm_column_name, gtf_path, df)
    features = _get_gtf_column(feature_column_name, gtf_path, df)
    logging.info("Loaded %d rows from %s" % (len(transcript_ids), gtf_path))
    logging.info("Found %s transcript entries" % sum(feature == "transcript"
                                                     for feature in features))
    result = {
        transcript_id: float(fpkm)
        for (transcript_id, fpkm,
             feature) in zip(transcript_ids, fpkm_values, features)
        if ((transcript_id is not None) and (len(transcript_id) > 0) and (
            feature == "transcript"))
    }
    logging.info("Keeping %d transcript rows with reference IDs" %
                 (len(result), ))
    return result
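_get_gtf_column is an internal helper that is not shown. A plausible minimal sketch, assuming it returns the column values as a list and reports the GTF path when the column is missing:

def _get_gtf_column(column_name, gtf_path, df):
    # hypothetical sketch; the real helper's error handling may differ
    if column_name not in df.columns:
        raise ValueError(
            "GTF file %s lacks expected column '%s'" % (gtf_path, column_name))
    return list(df[column_name])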
Example #7
def parse_expression_file(args, vcf_reader, vcf_writer):
    if args.format == 'stringtie' and args.mode == 'transcript':
        df_all = read_gtf(args.expression_file)
        df = df_all[df_all["feature"] == "transcript"]
        id_column = resolve_stringtie_id_column(args, df.columns.values)
    else:
        id_column = resolve_id_column(args)
        df = pd.read_csv(args.expression_file, sep='\t')
    if args.ignore_ensembl_id_version:
        df['transcript_without_version'] = df[id_column].apply(
            lambda x: re.sub(r'\.[0-9]+$', '', x))
    expression_column = resolve_expression_column(args)
    if expression_column not in df.columns.values:
        vcf_reader.close()
        vcf_writer.close()
        raise Exception(
            "ERROR: expression_column header {} does not exist in expression_file {}"
            .format(expression_column, args.expression_file))
    if id_column not in df.columns.values:
        vcf_reader.close()
        vcf_writer.close()
        raise Exception(
            "ERROR: id_column header {} does not exist in expression_file {}".
            format(id_column, args.expression_file))
    return df, id_column, expression_column
Example #8
def find_features(filePath):
    # open the file
    df = read_gtf(filePath)
    # select the feature column and convert the
    # DataFrame column to an array for manipulation
    feature = df["feature"].__array__()
    return set(feature)
Example #9
    def load_annotation(self, annotation_path):
        """
        A method for loading annotation using gtfparse library
        """
        annotation_type = path_features(annotation_path)["extension"]

        try:
            log.debug("Loading "+annotation_path)

            if (annotation_type == "gtf"):
                # disable root log due to gtfparse logging interference
                log_root = logging.getLogger("")
                log_root.disabled = True
                # read gtf
                annotation_df = gtfparse.read_gtf(annotation_path)
                # enable root log
                log_root.disabled = False

            # keep only the feature of interest
            annotation_df = annotation_df[annotation_df["feature"] == "exon"]
            # trim to required cols only
            annotation_df = annotation_df[["seqname", "start", "end", "strand", "gene_id", "transcript_id"]]

        except Exception as e:
            raise ValueError("An error occurred while loading annotation: "+str(e))

        annotation_df["start"] -= 1
        self.annotation_df = {"whole": annotation_df}
Example #10
def display_gtf_geneids(gtffile: str,
                        feature_type: Optional[List[str]] = None):
    """Display the geneids present in a GTF/GFF

    Parameters
    ----------
    gtffile : `str`
        GTF/GFF file that matches the input FASTA file.  Preferably one from Ensembl/GENCODE.
        Gzipped GTF/GFF files are acceptable, though their use may impose a performance penalty.
    feature_type : `Optional[List[str]]`
        If given, only these feature types are considered when collecting gene IDs.

    Returns
    -------
    `None`
    """

    gtf = read_gtf(gtffile)

    if feature_type is not None:
        gtf = gtf[gtf["feature"].isin(feature_type)]

    gene_set = gtf["gene_id"].unique()

    print(f"{len(gene_set)} genes found.  These include:")
    for _ in gene_set:
        print(_)
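A usage sketch; the file name is a placeholder and the feature filter assumes a GENCODE-style annotation:

display_gtf_geneids("gencode.v38.annotation.gtf.gz", feature_type=["gene"])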
Example #11
def main(gtf, out_dir, linc):
    gtf_df = read_gtf(gtf)
    linc_genes = gtf_df[gtf_df.transcript_biotype ==
                        'lincRNA'].gene_id.unique()
    gtf_df.gene_biotype.replace(gtf_tools['dict_GENCODE_CATEGORY_MAP'],
                                inplace=True)
    gtf_df.transcript_biotype.replace(gtf_tools['dict_GENCODE_CATEGORY_MAP'],
                                      inplace=True)
    if 'gene_name' in gtf_df.columns:
        mask = (gtf_df.gene_name == "")
        gtf_df.loc[mask, 'gene_name'] = gtf_df.loc[mask, 'gene_id']
    else:
        gtf_df.loc[:, 'gene_name'] = gtf_df.loc[:, 'gene_id']
    if linc:
        gtf_df = gtf_df.set_index('gene_id')
        gtf_df.loc[linc_genes, 'gene_biotype'] = 'lincRNA'
        gtf_df.gene_biotype.replace({'lncRNA': 'genic_lncRNA'}, inplace=True)
        gtf_df = gtf_df.reset_index()
    gene_df = gtf_df[gtf_df.gene_id != ""]
    gene_type_df = gene_df.loc[:, ['gene_id', 'gene_name', 'gene_biotype'
                                   ]].drop_duplicates()
    gene_type_file = os.path.join(out_dir, 'gene_type.txt')
    gene_type_df.to_csv(gene_type_file, sep='\t', index=False)
    tr_df = gtf_df[gtf_df.transcript_id != ""]
    tr_type_df = tr_df.loc[:, [
        'transcript_id', 'gene_id', 'gene_name', 'transcript_biotype',
        'gene_biotype'
    ]].drop_duplicates()
    tr_type_file = os.path.join(out_dir, 'transcript_type.txt')
    tr_type_df.to_csv(tr_type_file, sep='\t', index=False)
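The gtf_tools['dict_GENCODE_CATEGORY_MAP'] lookup used above (and again in Example #14) is not shown. A purely illustrative sketch of its shape, collapsing detailed GENCODE biotypes into broader categories:

# hypothetical contents; the real mapping covers many more biotypes
gtf_tools = {
    'dict_GENCODE_CATEGORY_MAP': {
        'lincRNA': 'lncRNA',
        'antisense': 'lncRNA',
        'miRNA': 'sncRNA',
        'snoRNA': 'sncRNA',
        'processed_pseudogene': 'pseudogene',
    },
}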
Example #12
def find_gene(filePath):
    # open the file
    df = read_gtf(filePath)
    # filter the rows whose feature is 'gene' and convert the
    # DataFrame to an array for manipulation
    source_array = df[df["feature"] == 'gene'].__array__()
    return source_array
Example #13
def find_chromossome(filePath):
    # open the file
    df = read_gtf(filePath)
    # select the seqname column and convert the
    # DataFrame column to an array for manipulation
    source_array = df["seqname"].__array__()
    return set(source_array)
Example #14
def main(gtf_file, compare_table, cov_file, species):
    gtf_df = gtfparse.read_gtf(gtf_file)
    gene_df = gtf_df[gtf_df.feature == 'gene']
    gene_type_df = gene_df.loc[:,
                               ['gene_id', 'gene_biotype']].drop_duplicates()
    gene_type_df = gene_type_df.set_index('gene_id')
    gene_type_df.gene_biotype.replace(gtf_tools['dict_GENCODE_CATEGORY_MAP'],
                                      inplace=True)
    gene_type_counts = gene_type_df.gene_biotype.value_counts()
    compare_table_df = pd.read_table(compare_table)
    assembly_genes = list()
    ref_assembly_df = compare_table_df[compare_table_df.category_relative ==
                                       'exonic_overlap']
    for each in ref_assembly_df.ref_gene_id:
        assembly_genes.extend(each.split(','))
    assembly_genes = list(set(assembly_genes))
    assembly_gene_type_df = gene_type_df.loc[assembly_genes]
    assembly_gene_type_counts = assembly_gene_type_df.gene_biotype.value_counts(
    )
    merged_df = pd.concat([gene_type_counts, assembly_gene_type_counts],
                          axis=1)
    merged_df.columns = ['reference', 'assembly']
    merged_df.loc[:, 'coverage'] = merged_df.assembly / merged_df.reference
    merged_df.loc[:, 'species'] = species
    merged_df.to_csv(cov_file, sep='\t', header=False)
Example #15
def extract_landmarks(gtf, landmarks=ALL_LANDMARKS):
    """Given an gene annotation GFF/GTF file,

    # Arguments
        gtf: File path or a loaded `pd.DataFrame` with columns:
    seqname, feature, start, end, strand
        landmarks: list or a dictionary of landmark extractors (function or name)

    # Note
        When landmark extractor names are used, they have to be implemented in
    the module `concise.preprocessing.position`

    # Returns
        Dictionary of pd.DataFrames with landmark positions
    (columns: seqname, position, strand)
    """
    if isinstance(gtf, str):
        _logger.info("Reading gtf file..")
        gtf = read_gtf(gtf)
        _logger.info("Done")

    _logger.info("Running landmark extractors..")
    # landmarks to a dictionary with a function
    assert isinstance(landmarks, (list, tuple, set, dict))
    if isinstance(landmarks, dict):
        landmarks = {k: _get_fun(v) for k, v in landmarks.items()}
    else:
        landmarks = {
            _to_string(fn_str): _get_fun(fn_str)
            for fn_str in landmarks
        }

    r = {k: _validate_pos(v(gtf)) for k, v in landmarks.items()}
    _logger.info("Done!")
    return r
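A landmark extractor, as consumed above, maps a GTF DataFrame to a DataFrame of positions. A minimal TSS sketch in that shape (an assumption, not the concise.preprocessing.position implementation):

def tss(gtf):
    # hypothetical extractor: the transcription start site is `start`
    # on '+' strands and `end` on '-' strands
    genes = gtf[gtf["feature"] == "gene"].copy()
    genes["position"] = genes["start"].where(genes["strand"] == "+",
                                             genes["end"])
    return genes[["seqname", "position", "strand"]]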
Example #16
def bed12_process():
    #file_path = "/home/zyang/Project/CRC/step49_track/gencode.vM25.basic.annotation.gtf"
    file_path = "/data3/zhaochen/project/colon_cancer/colon_chip/genomeTrack/gencode.vM25.basic.annotation.gtf"
    df = read_gtf(file_path)
    print(df.columns)
    print(df[0:10])
    df_genes = df[df["feature"] == "transcript"]
    df_sub = df_genes[['gene_name', 'transcript_id']]
    df_sub.to_csv("gene_name_and_transcriptID.txt", sep="\t", index=False)

    #bed_file = "/home/zyang/Project/CRC/step49_track/gencode.vM25.basic.annotation.sort.bed12"
    bed_file = "/data3/zhaochen/project/colon_cancer/colon_chip/genomeTrack/gencode.vM25.basic.annotation.sort.bed12"
    bed_df = pd.read_csv(bed_file, sep="\t",\
                names=["chr", "start", "end", "name", "score", "strand", "thick_start", "thick_end",\
                 "rgb", "block_count", "block_size", "block_start"])
    new_bed = bed_df.merge(df_sub,
                           left_on="name",
                           right_on="transcript_id",
                           how="left")
    new_bed = new_bed[["chr", "start", "end", "gene_name", "score", "strand", "thick_start", "thick_end",
                       "rgb", "block_count", "block_size", "block_start"]]
    new_bed.to_csv("sorted.changeName.bed",
                   sep="\t",
                   index=False,
                   header=False)
Example #17
def parse_gtf(fileParse, feature, fileDest):
    df = read_gtf(fileParse)
    # e.g df = read_gtf('C:/Users/breno/Desktop/Homo_sapiens.GRCh38.91.gtf')

    # examples of filtering on the feature column:
    # df_genes = df[df["feature"] == "gene"]
    # df_exons = df[df["feature"] == "exon"]
    # df_introns = df[df["feature"] == "intron"]
    # df_cds = df[df["feature"] == "CDS"]
    # df_genes = df[df["feature"] == "gene"]

    # filter rows matching the requested feature and
    # convert the DataFrame to an array for manipulation
    str_genes = df[df["feature"] == feature].__array__()

    # str_genes = df[df["feature"] == "gene"].__array__()

    # store the number of matching records in tam
    tam = len(str_genes)

    # open the output file for writing
    files_gene = open(fileDest, 'w')
    # e.g. files_gene = open('C:/Users/breno/Desktop/gene.txt', 'w')

    # iterate over the array and write each record to the file
    for i in range(tam):
        # write the fields one by one, separated by '|'
        # so the attributes can be counted later
        for j in range(str_genes.shape[1]):
            files_gene.write(str(str_genes[i][j]))
            files_gene.write('|')
        # end the line before writing the next record
        files_gene.write('\n')
    files_gene.close()
Example #18
    def load_dataframe(self, file_resources):
        # Parse the lncRNA GTF
        df = read_gtf(file_resources["long_noncoding_RNAs.gtf"])  # Returns a dask dataframe
        # Remove the ".<version>" suffix from the ENSG gene and transcript IDs
        df['gene_id'] = df['gene_id'].str.replace("[.].*", "")
        df['transcript_id'] = df['transcript_id'].str.replace("[.].*", "")
        return df
Example #19
def pickle_annotations(gtf_path, columns: list, features=['exon']):
    annotations = read_gtf(
        gtf_path,
        features=features).filter(
            columns).sort_values(by=['seqname', 'transcript_id', 'exon_number'])

    annotations.to_pickle("annotations.pkl")
    sys.exit()
Example #20
def load_exons(model_gtf):
    """load GTF exons into a list of Series objects of exons"""
    gtf_df = read_gtf(model_gtf)
    gtf_df = gtf_df.loc[gtf_df.feature == 'exon']
    if len(gtf_df) == 0:
        raise GtfException("no exon records found")
    fixup_gtf_attrs(gtf_df)
    return [gtf_df.iloc[i] for i in range(len(gtf_df))]
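fixup_gtf_attrs is not shown. A hypothetical sketch of what such a normalization step might do; the real function's behavior is not given in this example:

def fixup_gtf_attrs(gtf_df):
    # hypothetical: coerce coordinates to int and strip ID version suffixes
    gtf_df["start"] = gtf_df["start"].astype(int)
    gtf_df["end"] = gtf_df["end"].astype(int)
    gtf_df["transcript_id"] = gtf_df["transcript_id"].str.replace(
        r"\.\d+$", "", regex=True)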
Example #21
    def _load_gtf_as_dataframe(self, usecols=None, features=None):
        """
        Parse this genome source's GTF file and load it as a Pandas DataFrame
        """
        logger.info("Reading GTF from %s", self.gtf_path)
        df = read_gtf(
            self.gtf_path,
            column_converters={
                "seqname": normalize_chromosome,
                "strand": normalize_strand,
            },
            infer_biotype_column=True,
            usecols=usecols,
            features=features)

        column_names = set(df.keys())
        expect_gene_feature = features is None or "gene" in features
        expect_transcript_feature = features is None or "transcript" in features
        observed_features = set(df["feature"])

        # older Ensembl releases don't have "gene" or "transcript"
        # features, so fill in those rows if they're missing
        if expect_gene_feature and "gene" not in observed_features:
            # if we have to reconstruct gene feature rows then
            # fill in values for 'gene_name' and 'gene_biotype'
            # but only if they're actually present in the GTF
            logger.info("Creating missing gene features...")
            df = create_missing_features(
                dataframe=df,
                unique_keys={"gene": "gene_id"},
                extra_columns={
                    "gene": {
                        "gene_name",
                        "gene_biotype"
                    }.intersection(column_names),
                },
                missing_value="")
            logger.info("Done.")

        if expect_transcript_feature and "transcript" not in observed_features:
            logger.info("Creating missing transcript features...")
            df = create_missing_features(
                dataframe=df,
                unique_keys={"transcript": "transcript_id"},
                extra_columns={
                    "transcript": {
                        "gene_id",
                        "gene_name",
                        "gene_biotype",
                        "transcript_name",
                        "transcript_biotype",
                        "protein_id",
                    }.intersection(column_names)
                },
                missing_value="")
            logger.info("Done.")

        return df
Example #22
def gtf():
    # save a smaller version of the annotation
    # from concise.preprocessing.landmarks import read_gtf
    # gtf_path = "/s/genomes/human/hg38/GRCh38.p7/gencode.v25.annotation.gtf"
    # gtf = read_gtf(gtf_path)
    # gtf_small = gtf[gtf.seqnames == "chr22"]
    # gtf_small.to_pickle("data/gencode_v25_chr22.gtf.pkl.gz")  # 116k

    return read_gtf("data/gencode.v24.annotation_chr22.gtf.gz")
Example #23
    def read_gene_gtf_file(self):
        # ['start','end','gene_name','gene_id','seqname','exon_number','feature']
        data_frame = read_gtf(
            self.gene_gtf_file_name,
            usecols=['start', 'end', 'seqname', 'feature', 'gene_biotype'])

        data = data_frame.query("feature == 'gene'")
        data = data.reset_index(drop=True)
        data.to_csv("gene_file.csv")
Example #24
def test_ensembl_gtf_gene_names_with_usecols_gzip():
    df = read_gtf(ENSEMBL_GTF_PATH + ".gz", usecols=["gene_name"])
    gene_names = set(df["gene_name"])
    assert gene_names == EXPECTED_GENE_NAMES, \
        "Wrong gene names: %s, missing %s and unexpected %s" % (
            gene_names,
            EXPECTED_GENE_NAMES.difference(gene_names),
            gene_names.difference(EXPECTED_GENE_NAMES)
        )
Example #25
def identify_transcripts(gtf_file, regtools_file, sample_name):
    """Filter GTF tsv file to find tumor junction coordinates from regtools

    Args:
        gtf_file (string): path to gtf file
        regtools_file (string): path to (filtered) regtools output excel file
        sample_name (string): sample to select from the regtools output

    Returns:
        junctions.gtf (file): filtered gtf only containing transcripts that correspond to regtools junctions
        transcripts.fa (file): fasta file with coding transcript sequences corresponding to junctions.gtf file
        gtf_transcripts (df): list of altered transcripts
    """
    # convert gtf to pd df
    gtf_data = read_gtf(gtf_file)
    # read in regtools significant junctions as pd df
    junctions = pd.read_excel(regtools_file, sheet_name='Sheet1')
    junctions = junctions.loc[junctions['Sample'] == sample_name]
    print(junctions)
    total_transcripts = {}
    for row_index, row in junctions.iterrows():
        start_exons = gtf_data.loc[(gtf_data['end'] == row["start"])
                                   & (gtf_data["seqname"] == row["chrom"])]
        stop_exons = gtf_data.loc[(gtf_data['start'] == row["end"])
                                  & (gtf_data["seqname"] == row["chrom"])]
        print(start_exons, stop_exons)
        transcript_dict = {
            t: row["gene_names"]
            for t in list(start_exons['transcript_id'])
            if t in list(stop_exons['transcript_id'])
        }
        total_transcripts.update(transcript_dict)
    gtf_transcripts = gtf_data.loc[
        (gtf_data["transcript_id"].isin(total_transcripts.keys()))
        & (gtf_data["feature"] == "transcript")]
    gtf_transcripts["gene"] = [
        total_transcripts[x] for x in gtf_transcripts["transcript_id"]
    ]
    # filter gtf_transcripts
    if "transcript_type" in gtf_transcripts.columns:
        gtf_transcripts = gtf_transcripts.loc[
            gtf_transcripts["transcript_type"] == "protein_coding"]
    # write subsetted gtf file
    write_file = open("junctions.gtf", "w")
    for item, gene in zip(list(gtf_transcripts["transcript_id"]),
                          list(gtf_transcripts["gene"])):
        for line in open(gtf_file).readlines():
            if re.search(item, line):
                new_line = line.strip() + f' gene_name "{gene}";\n'
                write_file.write(new_line)
    write_file.close()
    # create BED12 file from junctions.gtf
    # create fasta file with transcript (exon-only) sequences
    subprocess.Popen(
        "gtfToGenePred junctions.gtf test.genePhred && genePredToBed test.genePhred results.bed && bedtools getfasta -fi ~/Documents/ref_fasta/GRCh38.d1.vd1.fa -fo transcripts.fa -bed results.bed -split -name -s && sed -i.bak 's/(-)//;s/(+)//' transcripts.fa",
        shell=True)
    return gtf_transcripts
Example #26
def gene_loc_inf(gtf_file, outfile):
    gtf_df = gtfparse.read_gtf(gtf_file)
    gene_chrom = gtf_df.groupby(['gene_id'])['seqname'].first()
    gene_chrom.name = 'chrom'
    gene_start = gtf_df.groupby(['gene_id'])['start'].min()
    gene_end = gtf_df.groupby(['gene_id'])['end'].max()
    gene_strand = gtf_df.groupby(['gene_id'])['strand'].first()
    gene_loc_df = pd.concat([gene_chrom, gene_start, gene_end, gene_strand],
                            axis=1)
    gene_loc_df.to_csv(outfile, sep='\t')
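A usage sketch; file names are placeholders:

# writes one row per gene_id with chrom, start, end and strand columns
gene_loc_inf("annotation.gtf", "gene_locations.tsv")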
Example #27
    def _load_gtf_as_dataframe(self, usecols=None, features=None):
        """
        Parse this genome source's GTF file and load it as a Pandas DataFrame
        """
        logger.info("Reading GTF from %s", self.gtf_path)
        df = read_gtf(self.gtf_path,
                      column_converters={
                          "seqname": normalize_chromosome,
                          "strand": normalize_strand,
                      },
                      infer_biotype_column=True,
                      usecols=usecols,
                      features=features)

        column_names = set(df.keys())
        expect_gene_feature = features is None or "gene" in features
        expect_transcript_feature = features is None or "transcript" in features
        observed_features = set(df["feature"])

        # older Ensembl releases don't have "gene" or "transcript"
        # features, so fill in those rows if they're missing
        if expect_gene_feature and "gene" not in observed_features:
            # if we have to reconstruct gene feature rows then
            # fill in values for 'gene_name' and 'gene_biotype'
            # but only if they're actually present in the GTF
            logger.info("Creating missing gene features...")
            df = create_missing_features(dataframe=df,
                                         unique_keys={"gene": "gene_id"},
                                         extra_columns={
                                             "gene":
                                             {"gene_name", "gene_biotype"
                                              }.intersection(column_names),
                                         },
                                         missing_value="")
            logger.info("Done.")

        if expect_transcript_feature and "transcript" not in observed_features:
            logger.info("Creating missing transcript features...")
            df = create_missing_features(
                dataframe=df,
                unique_keys={"transcript": "transcript_id"},
                extra_columns={
                    "transcript": {
                        "gene_id",
                        "gene_name",
                        "gene_biotype",
                        "transcript_name",
                        "transcript_biotype",
                        "protein_id",
                    }.intersection(column_names)
                },
                missing_value="")
            logger.info("Done.")

        return df
Example #28
def main(gtf, output):
    gtf_df = gtfparse.read_gtf(gtf)
    gtf_exon_df = gtf_df[gtf_df.feature == 'exon']
    gtf_exon_df.loc[:, 'exon_len'] = gtf_exon_df.end - gtf_exon_df.start + 1
    tr_len = gtf_exon_df.groupby(['transcript_id'])['exon_len'].sum()
    tr_gene = gtf_exon_df.loc[:,
                              ['transcript_id', 'gene_id']].drop_duplicates()
    tr_gene = tr_gene.set_index('transcript_id')
    tr_gene_len = pd.concat([tr_len, tr_gene], axis=1)
    gene_len = tr_gene_len.groupby(['gene_id'])['exon_len'].median()
    gene_len.to_csv(output, header=False, sep='\t')
Example #29
def main(input_gtf, in_silico_circ, real_circ_table, gene_exp):
    real_circ_df = pd.read_table(real_circ_table)
    real_circ_df.loc[:, 'chrom'] = real_circ_df.chrom.astype(str)
    real_circ_df = real_circ_df.set_index(['chrom', 'start', 'end'])
    circ_num = len(real_circ_df)
    is_circ_dict = OrderedDict()
    gtf_df = gtfparse.read_gtf(input_gtf)
    exon_df = gtf_df[gtf_df.feature == 'exon']
    exon_df = exon_df.set_index('transcript_id')
    exon_df = exon_df.sort_values(['seqname', 'start'])
    exon_df.loc[:, 'start_0base'] = exon_df.start - 1
    # gene exp cat df
    if gene_exp is not None:
        gene_exp_df = pd.read_table(gene_exp)
        exp_genes_df = gene_exp_df[gene_exp_df.tpm >= 10]
        exon_df = exon_df[exon_df.gene_id.isin(exp_genes_df.Gene_id)]
    for each_tr in exon_df.index.unique():
        # in silico circRNA from same transcript set
        if each_tr not in real_circ_df.isoformName.unique():
            continue
        each_tr_exons = exon_df.loc[each_tr]
        each_tr_introns = get_introns(each_tr_exons)
        if each_tr_introns is None:
            continue
        exon_num = len(each_tr_exons)
        chrom = str(each_tr_exons.iloc[0].seqname)
        strand = each_tr_exons.iloc[0].strand
        gene = each_tr_exons.iloc[0].gene_id
        for each_com in itertools.combinations_with_replacement(
                range(exon_num), 2):
            start = each_tr_exons.iloc[each_com[0]].start_0base
            end = each_tr_exons.iloc[each_com[1]].end
            # filter real circRNA from in silico circRNA
            if (chrom, start, end) in real_circ_df.index:
                continue
            flank_intron = get_flank_intron(chrom, each_com, each_tr_introns)
            is_circ_dict.setdefault('chrom', []).append(chrom)
            is_circ_dict.setdefault('start', []).append(start)
            is_circ_dict.setdefault('end', []).append(end)
            is_circ_dict.setdefault('strand', []).append(strand)
            is_circ_dict.setdefault('flankIntron', []).append(flank_intron)
            is_circ_dict.setdefault('isoformName', []).append(each_tr)
            is_circ_dict.setdefault('geneID', []).append(gene)
    is_circ_df = pd.DataFrame(is_circ_dict)
    np.random.seed(0)
    selected_circ = np.random.choice(is_circ_df.index.values, circ_num)
    is_circ_df = is_circ_df.loc[selected_circ]
    is_circ_df.loc[:, 'circRNAID'] = [
        'in_silico_circ_{num:0>10}'.format(num=each + 1)
        for each in range(len(is_circ_df))
    ]
    is_circ_df = is_circ_df.set_index('circRNAID')
    is_circ_df.to_csv(in_silico_circ, sep='\t')
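The get_introns and get_flank_intron helpers used above are not shown. Rough sketches under the assumption that introns are the gaps between consecutive exons; the original coordinate conventions may differ:

def get_introns(tr_exons):
    # hypothetical: list of (start, end) gaps between consecutive exons
    exons = tr_exons.sort_values('start')
    if len(exons) < 2:
        return None
    return [(int(exons.iloc[i].end), int(exons.iloc[i + 1].start - 1))
            for i in range(len(exons) - 1)]

def get_flank_intron(chrom, exon_pair, introns):
    # hypothetical: report the introns immediately flanking the exon pair
    left = introns[exon_pair[0] - 1] if exon_pair[0] > 0 else None
    right = introns[exon_pair[1]] if exon_pair[1] < len(introns) else None
    to_str = lambda iv: '{}:{}-{}'.format(chrom, *iv) if iv else 'None'
    return '{}|{}'.format(to_str(left), to_str(right))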
Example #30
def load_annotations(gtf_path, columns: list, features=['exon']):
    if not __debug__:
        # load from pickled object
        annotations = pd.read_pickle(gtf_path)
        return annotations

    # NOTE: GENCODE data format;
    # https://www.gencodegenes.org/pages/data_format.html
    annotations = read_gtf(gtf_path
                           ).query("feature == {}".format(features)).filter(
        columns).sort_values(by=['seqname', 'transcript_id', 'exon_number'])

    return annotations
Example #31
    def __init__(self,
                 intervals_file,
                 fasta_file,
                 gtf_file,
                 filter_protein_coding=True,
                 target_file=None,
                 use_linecache=True):
        if sys.version_info[0] != 3:
            warnings.warn(
                "Only Python 3 is supported. You are using Python {0}".format(
                    sys.version_info[0]))
        self.gtf = read_gtf(gtf_file)

        self.filter_protein_coding = filter_protein_coding

        if self.filter_protein_coding:
            if "gene_type" in self.gtf:
                self.gtf = self.gtf[self.gtf["gene_type"] == "protein_coding"]
            elif "gene_biotype" in self.gtf:
                self.gtf = self.gtf[self.gtf["gene_biotype"] ==
                                    "protein_coding"]
            else:
                warnings.warn(
                    "Gtf doesn't have the field 'gene_type' or 'gene_biotype'. Considering genomic landmarks "
                    "of all genes, not just protein_coding.")

        if not np.any(self.gtf.seqname.str.contains("chr")):
            self.gtf["seqname"] = "chr" + self.gtf["seqname"]

        # intervals
        if use_linecache:
            self.bt = BedToolLinecache(intervals_file)
        else:
            self.bt = BedTool(intervals_file)

        # extractors
        self.fasta_file = fasta_file
        self.seq_extractor = None
        self.dist_extractor = None

        # here the DATALOADER_DIR contains the path to the current directory
        self.dist_transformer = DistanceTransformer(
            ALL_LANDMARKS,
            DATALOADER_DIR + "/dataloader_files/position_transformer.pkl")

        # target
        if target_file:
            self.target_dataset = TxtDataset(target_file)
            assert len(self.target_dataset) == len(self.bt)
        else:
            self.target_dataset = None