def expected_result_same_strand_intersection_simple_granges():
    """Build a one-row PyRanges from an inline whitespace-delimited table.

    The first line of the table is the header (Chromosome, Start, End,
    Strand, Score); the single data row is ``chr1 6 7 - 7``.
    """
    table = (
        "Chromosome Start End Strand Score\n"
        "chr1 6 7 - 7"
    )
    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    df = pd.read_table(StringIO(table), sep=r"\s+", header=0)
    return PyRanges(df)
def read_gtf(f):
    """Read a GTF file into a PyRanges.

    Only seven of the nine GTF fields are read (positions 0, 2, 3, 4, 5, 6
    and 8); ``source`` and ``frame`` are skipped. ``#`` starts a comment.
    Start and End are kept exactly as read: no coordinate conversion is
    applied in this function.

    GTF fields
    ----------
    seqname - name of the chromosome or scaffold; chromosome names can be
        given with or without the 'chr' prefix. Important note: the seqname
        must be one used within Ensembl, i.e. a standard chromosome name or
        an Ensembl identifier such as a scaffold ID, without any additional
        content such as species or assembly.
    source (skipped) - name of the program that generated this feature, or
        the data source (database or project name).
    feature - feature type name, e.g. Gene, Variation, Similarity.
    start - start position of the feature, with sequence numbering
        starting at 1.
    end - end position of the feature, with sequence numbering starting
        at 1.
    score - a floating point value.
    strand - defined as + (forward) or - (reverse).
    frame (skipped) - one of '0', '1' or '2'. '0' indicates that the first
        base of the feature is the first base of a codon, '1' that the
        second base is the first base of a codon, and so on.
    attribute - a semicolon-separated list of tag-value pairs, providing
        additional information about each feature.

    Parameters
    ----------
    f : str or file-like
        Passed straight to ``pandas.read_table``.

    Returns
    -------
    PyRanges
        Built from the columns Chromosome, Start, End, Strand, Feature
        (plus Score unless every Score value is ".") followed by GeneID,
        TranscriptID, ExonNumber (cast to float) and ExonID.
    """
    dtypes = {"Chromosome": "category", "Feature": "category", "Strand": "category"}
    names = "Chromosome Feature Start End Score Strand Attribute".split()

    df = pd.read_table(
        f,
        sep="\t",
        comment="#",
        usecols=[0, 2, 3, 4, 5, 6, 8],
        header=None,
        names=names,
        dtype=dtypes,
    )

    # Drop the Score column when it carries no information (all ".").
    cols_to_concat = "Chromosome Start End Strand Feature".split()
    if not (df.Score == ".").all():
        cols_to_concat.append("Score")

    extract = _fetch_gene_transcript_exon_id(df.Attribute)
    extract.columns = "GeneID TranscriptID ExonNumber ExonID".split()
    extract.ExonNumber = extract.ExonNumber.astype(float)

    df = pd.concat([df[cols_to_concat], extract], axis=1)

    return PyRanges(df)
def expected_result_unstranded():
    """Build a two-row PyRanges from an inline single-space-delimited table.

    The header names twelve columns (Chromosome ... Distance). Each data row
    has thirteen fields: a leading row number followed by the twelve values.
    """
    header = "Chromosome Start End Name Score Strand Start_b End_b Name_b Score_b Strand_b Distance"
    rows = [
        "0 chr1 3 6 h 0 + 6 7 f 0 - 1",
        "1 chr1 5 7 h 0 - 6 7 f 0 - 0",
    ]
    table = "\n".join([header] + rows)
    frame = pd.read_table(StringIO(table), sep=" ")
    return PyRanges(frame)
def hyp4():
    """Build a three-row PyRanges on chr1, all on the + strand.

    The inline table has no header; columns are named Chromosome, Start,
    End and Strand.
    """
    table = (
        "chr1 0 3 +\n"
        "chr1 3 4 +\n"
        "chr1 1 2 +"
    )
    names = "Chromosome Start End Strand".split()
    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    df = pd.read_table(StringIO(table), sep=r"\s+", header=None, names=names)
    return PyRanges(df)
def expected_result_subtract_simple_granges():
    """Build a two-row PyRanges from an inline whitespace-delimited table.

    The first line is the header (Chromosome, Start, End, Strand, Score);
    both data rows are on chr1, + strand.
    """
    table = (
        "Chromosome Start End Strand Score\n"
        "chr1 3 6 + 5\n"
        "chr1 8 9 + 1"
    )
    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    df = pd.read_table(StringIO(table), sep=r"\s+", header=0)
    return PyRanges(df)
def expected_result_counterexample5():
    """Build a two-row PyRanges on chr2 from a headerless inline table.

    Columns are named Chromosome, Start, End, Strand, Start_b, End_b,
    Strand_b and Distance.
    """
    table = (
        "chr2 0 1 + 1 2 + 1\n"
        "chr2 2 3 + 1 3 + 0"
    )
    names = "Chromosome Start End Strand Start_b End_b Strand_b Distance".split()
    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    df = pd.read_table(StringIO(table), sep=r"\s+", header=None, names=names)
    return PyRanges(df)
def expected_result_regular_intersection():
    """Build a three-row PyRanges from a headerless inline table.

    Columns are named Chromosome, Start, End, Name, Score and Strand.
    """
    table = (
        "chr1 226987603 226987617 U0 0 +\n"
        "chr8 38747236 38747251 U0 0 -\n"
        "chr15 26105515 26105518 U0 0 +"
    )
    names = "Chromosome Start End Name Score Strand".split()
    # Split on any whitespace instead of relying on the default tab
    # separator: literal tabs inside a source string are invisible and are
    # easily turned into spaces by editors, which would collapse every row
    # into a single field.
    df = pd.read_table(StringIO(table), sep=r"\s+", header=None, names=names)
    return PyRanges(df)
def expected_result_same_stranded():
    """Build a two-row PyRanges from an inline single-space-delimited table.

    The header names twelve columns (Chromosome ... Distance). Each data row
    has thirteen fields: a leading row number followed by the twelve values.
    """
    header = "Chromosome Start End Name Score Strand Start_b End_b Name_b Score_b Strand_b Distance"
    rows = [
        "0 chr1 10241 10440 HWI-ST216_313:3:1302:4516:156396 1 - 9988 10187 HWI-ST216:427:D29R1ACXX:2:1205:6095:16532 1 - 55",
        "1 chr1 110246 110445 HWI-ST216_313:3:1207:4315:142177 1 + 16109 16308 HWI-ST216:427:D29R1ACXX:2:2110:12286:25379 1 + 93939",
    ]
    table = "\n".join([header] + rows)
    frame = pd.read_table(StringIO(table), sep=" ")
    return PyRanges(frame)
def expected_result_opposite_stranded():
    """Build a two-row PyRanges from an inline single-space-delimited table.

    The header names twelve columns (Chromosome ... Distance). Each data row
    has thirteen fields: a leading row number followed by the twelve values.
    """
    header = "Chromosome Start End Name Score Strand Start_b End_b Name_b Score_b Strand_b Distance"
    rows = [
        "0 chr1 10246 10445 HWI-ST216_313:3:1207:4315:142177 1 + 9988 10187 HWI-ST216:427:D29R1ACXX:2:1205:6095:16532 1 - 60",
        "1 chr1 110246 110445 HWI-ST216_313:3:1207:4315:142177 1 + 19958 20157 HWI-ST216:427:D29R1ACXX:2:1313:6283:67310 1 - 90090",
    ]
    table = "\n".join([header] + rows)
    frame = pd.read_table(StringIO(table), sep=" ")
    return PyRanges(frame)
def read_bed(f, output_df=False):
    """Read a tab-separated BED file.

    A header row is detected by inspecting the second whitespace-separated
    field of the first line: if it cannot be parsed as an integer, the first
    line is treated as a header, otherwise the file is read as headerless.
    Either way the column labels are then replaced by the standard BED
    names, truncated to the number of columns actually present.

    Parameters
    ----------
    f : str
        Path to the BED file.
    output_df : bool, default False
        Return the pandas DataFrame instead of wrapping it in a PyRanges.

    Returns
    -------
    PyRanges or pandas.DataFrame

    Raises
    ------
    IndexError
        If the first line has fewer than two whitespace-separated fields.
    """
    columns = (
        "Chromosome Start End Name Score Strand ThickStart ThickEnd "
        "ItemRGB BlockCount BlockSizes BlockStarts"
    ).split()

    # Peek at the first line only; the context manager makes sure the
    # handle is closed instead of being left to the garbage collector.
    with open(f) as handle:
        first_start = handle.readline().split()[1]

    try:
        int(first_start)
    except ValueError:
        # Non-numeric "Start" field: the first line is a header.
        header = 0
    else:
        header = None

    df = pd.read_csv(
        f,
        dtype={"Chromosome": "category", "Strand": "category"},
        header=header,
        sep="\t",
    )
    df.columns = columns[:df.shape[1]]

    if output_df:
        return df
    return PyRanges(df)
def expected_result_overlap_same_strand():
    """Build a one-row PyRanges from an inline whitespace-delimited table.

    The header names eleven columns: the six BED-style columns followed by
    their ``_b``-suffixed counterparts (Start_b ... Strand_b).
    """
    table = (
        "Chromosome Start End Name Score Strand Start_b End_b Name_b Score_b Strand_b\n"
        "chr15 26105515 26105540 U0 0 + 26105493 26105518 U0 0 +"
    )
    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    df = pd.read_table(StringIO(table), header=0, sep=r"\s+")
    return PyRanges(df)
def read_bam(f, sparse=True, output_df=False, mapq=0, required_flag=0, filter_flag=1540):
    """Read a BAM file through the optional ``bamread`` package.

    Parameters
    ----------
    f : str
        Path to the BAM file; forwarded to ``bamread``.
    sparse : bool, default True
        If True call ``bamread.read_bam``; otherwise call
        ``bamread.read_bam_full``.
    output_df : bool, default False
        Return whatever ``bamread`` produced instead of wrapping it in a
        PyRanges.
    mapq, required_flag, filter_flag : int
        Forwarded positionally, in this order, to the ``bamread`` reader.

    Returns
    -------
    PyRanges, or the raw ``bamread`` result when ``output_df`` is True.

    Notes
    -----
    Prints a message and exits the interpreter with status 1 when
    ``bamread`` is not installed, or when ``sparse`` is False and the
    installed ``bamread`` does not provide ``read_bam_full``.
    """
    try:
        import bamread
    except ModuleNotFoundError:
        print(
            "bamread must be installed to read bam. Use `conda install -c bioconda bamread` or `pip install bamread` to install it."
        )
        sys.exit(1)

    if sparse:
        reader = bamread.read_bam
    else:
        # Only the attribute lookup is guarded, so an AttributeError raised
        # while actually reading the file is not misreported as a version
        # problem.
        try:
            reader = bamread.read_bam_full
        except AttributeError:
            print(
                "bamread version 0.0.6 or higher is required to read bam non-sparsely."
            )
            # Stop here: there is no result to return, and falling through
            # would fail on an unbound local instead.
            sys.exit(1)

    df = reader(f, mapq, required_flag, filter_flag)

    if output_df:
        return df
    return PyRanges(df)
def simple_gr2():
    """Build a two-row PyRanges on chr1 from an inline table.

    The first line is the header (Chromosome, Start, End, Strand, Score);
    one row is on the + strand and one on the - strand.
    """
    table = (
        "Chromosome Start End Strand Score\n"
        "chr1 1 2 + 1\n"
        "chr1 6 7 - 2"
    )
    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    df = pd.read_table(StringIO(table), sep=r"\s+", header=0)
    return PyRanges(df)
def expected_result_overlap_opposite_strand():
    """Build a two-row PyRanges from an inline whitespace-delimited table.

    The header names eleven columns: the six BED-style columns followed by
    their ``_b``-suffixed counterparts (Start_b ... Strand_b). In both rows
    Strand and Strand_b differ.
    """
    table = (
        "Chromosome Start End Name Score Strand Start_b End_b Name_b Score_b Strand_b\n"
        "chr8 38747226 38747251 U0 0 - 38747236 38747261 U0 0 +\n"
        "chr1 226987592 226987617 U0 0 + 226987603 226987628 U0 0 -"
    )
    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    df = pd.read_table(StringIO(table), sep=r"\s+")
    return PyRanges(df)
def read_gtf_restricted(f, annotation=None, output_df=False, skiprows=0, nrows=None):
    """Read a GTF file, keeping only a restricted set of attributes.

    The file is read in chunks of 100000 rows. Only seven of the nine GTF
    fields are read (positions 0, 2, 3, 4, 5, 6 and 8); ``source`` and
    ``frame`` are skipped. ``#`` starts a comment. Start is shifted from
    1-based to 0-based exactly once, after all chunks are combined.

    GTF fields
    ----------
    seqname - name of the chromosome or scaffold; chromosome names can be
        given with or without the 'chr' prefix. Important note: the seqname
        must be one used within Ensembl, i.e. a standard chromosome name or
        an Ensembl identifier such as a scaffold ID, without any additional
        content such as species or assembly.
    source (skipped) - name of the program that generated this feature, or
        the data source (database or project name).
    feature - feature type name, e.g. Gene, Variation, Similarity.
    start - start position of the feature, with sequence numbering
        starting at 1.
    end - end position of the feature, with sequence numbering starting
        at 1.
    score - a floating point value.
    strand - defined as + (forward) or - (reverse).
    frame (skipped) - one of '0', '1' or '2'. '0' indicates that the first
        base of the feature is the first base of a codon, '1' that the
        second base is the first base of a codon, and so on.
    attribute - a semicolon-separated list of tag-value pairs, providing
        additional information about each feature.

    Parameters
    ----------
    f : str or file-like
        Passed straight to ``pandas.read_csv``.
    annotation : optional
        Forwarded unchanged to ``_fetch_gene_transcript_exon_id``.
    output_df : bool, default False
        Return the pandas DataFrame instead of wrapping it in a PyRanges.
    skiprows : int, default 0
        Forwarded to ``pandas.read_csv``.
    nrows : int, default None
        Number of rows to read; None reads everything.

    Returns
    -------
    PyRanges or pandas.DataFrame
        Columns Chromosome, Start, End, Strand, Feature (plus Score for
        chunks where not every Score value is ".") followed by gene_id,
        transcript_id, exon_number (cast to float) and exon_id.
    """
    dtypes = {
        "Chromosome": "category",
        "Feature": "category",
        "Strand": "category"
    }

    df_iter = pd.read_csv(
        f,
        sep="\t",
        comment="#",
        usecols=[0, 2, 3, 4, 5, 6, 8],
        header=None,
        names="Chromosome Feature Start End Score Strand Attribute".split(),
        dtype=dtypes,
        chunksize=int(1e5),
        skiprows=skiprows,
        nrows=nrows)

    dfs = []
    for df in df_iter:
        # Drop the Score column for chunks where it is all ".".
        cols_to_concat = "Chromosome Start End Strand Feature".split()
        if not (df.Score == ".").all():
            cols_to_concat.append("Score")

        # NOTE(review): the axis=1 concat below aligns on the index; this
        # assumes the helper returns a frame indexed like df.Attribute,
        # which matters for every chunk after the first - TODO confirm.
        extract = _fetch_gene_transcript_exon_id(df.Attribute, annotation)
        extract.columns = "gene_id transcript_id exon_number exon_id".split()
        extract.exon_number = extract.exon_number.astype(float)

        df = pd.concat([df[cols_to_concat], extract], axis=1, sort=False)
        dfs.append(df)

    df = pd.concat(dfs, sort=False)

    # GTF Start is 1-indexed; convert once here. (Shifting inside the chunk
    # loop as well would move every Start by two.)
    df.loc[:, "Start"] = df.Start - 1

    if not output_df:
        return PyRanges(df)
    else:
        return df
def test_instantiation_without_strand(chip_10_plus_one):
    """Smoke test: a PyRanges can be constructed without strand information.

    Passes only if the constructor call does not raise; nothing is asserted
    about the resulting object.
    """
    source = chip_10_plus_one

    # Deliberately mix input kinds: `.values` for seqnames and starts, the
    # attribute as-is for ends.
    pr = PyRanges(
        seqnames=source.Chromosome.values,
        starts=source.Start.values,
        ends=source.End,
    )
def expected_result_same_strand():
    """Build a one-row PyRanges from an inline whitespace-delimited table.

    The header names eleven columns: the six BED-style columns followed by
    their ``_b``-suffixed counterparts (Start_b ... Strand_b).
    """
    table = (
        "Chromosome Start End Name Score Strand Start_b End_b Name_b Score_b Strand_b\n"
        "chr1 2 4 U0 0 + 1 9 U0 0 +"
    )
    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    df = pd.read_table(StringIO(table), header=0, sep=r"\s+")
    return PyRanges(df)
def load_dataset(basename):
    """Load one of the BED files under pyranges' ``example_data`` directory.

    Parameters
    ----------
    basename : str
        File name without the ``.bed`` extension.

    Returns
    -------
    PyRanges
        Built from a headerless table whose columns are named Chromosome,
        Start, End, Name, Score and Strand.
    """
    resource = "example_data/{}.bed".format(basename)
    full_path = pkg_resources.resource_filename("pyranges", resource)

    column_names = "Chromosome Start End Name Score Strand".split()
    frame = pd.read_table(full_path, header=None, names=column_names)

    return PyRanges(frame)
def expected_result_regular_overlap_intersection():
    """Build a three-row PyRanges from an inline whitespace-delimited table.

    The header names thirteen columns: Chromosome, Start, End, the
    ``_a``-suffixed columns (Start_a, End_a, Name_a, Score_a), Strand, and
    the ``_b``-suffixed columns (Start_b ... Strand_b).
    """
    table = (
        "Chromosome Start End Start_a End_a Name_a Score_a Strand Start_b End_b Name_b Score_b Strand_b\n"
        "chr1 226987603 226987617 226987592 226987617 U0 0 + 226987603 226987628 U0 0 -\n"
        "chr8 38747236 38747251 38747226 38747251 U0 0 - 38747236 38747261 U0 0 +\n"
        "chr15 26105515 26105518 26105515 26105540 U0 0 + 26105493 26105518 U0 0 +"
    )
    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    df = pd.read_table(StringIO(table), sep=r"\s+")
    return PyRanges(df)
def simple_gr1():
    """Build a three-row PyRanges on chr1 from an inline table.

    The first line is the header, with Strand before Score (Chromosome,
    Start, End, Strand, Score).
    """
    table = (
        "Chromosome Start End Strand Score\n"
        "chr1 3 6 + 5\n"
        "chr1 5 7 - 7\n"
        "chr1 8 9 + 1"
    )
    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    df = pd.read_table(StringIO(table), sep=r"\s+", header=0)
    return PyRanges(df)
def simple_gr1():
    """Build a three-row PyRanges on chr1 from an inline table.

    The first line is the header, with Score before Strand (Chromosome,
    Start, End, Score, Strand).
    """
    table = (
        "Chromosome Start End Score Strand\n"
        "chr1 3 6 5 +\n"
        "chr1 5 7 7 -\n"
        "chr1 8 9 1 +"
    )
    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    df = pd.read_table(StringIO(table), sep=r"\s+", header=0)
    return PyRanges(df)
def read_gff3(f, annotation=None, as_df=False, nrows=None, skiprows=0):
    """Read files in the General Feature Format.

    The file is read in chunks of 100000 rows. For each chunk the Attribute
    column is expanded by ``to_rows_gff3`` and replaced by the columns it
    returns. After all chunks are combined, Start is decremented by one.

    Parameters
    ----------
    f : str
        Path to GFF file.
    annotation : optional
        Accepted but not used by this function.
    as_df : bool, default False
        Whether to return as pandas DataFrame instead of PyRanges.
    nrows : int, default None
        Number of rows to read. Default None, i.e. all.
    skiprows : int, default 0
        Forwarded to ``pandas.read_csv``.

    See Also
    --------
    pyranges.read_gtf : read files in the Gene Transfer Format
    """
    names = [
        "Chromosome", "Source", "Feature", "Start", "End",
        "Score", "Strand", "Frame", "Attribute",
    ]
    dtypes = dict(Chromosome="category", Feature="category", Strand="category")

    reader = pd.read_csv(
        f,
        comment="#",
        sep="\t",
        header=None,
        names=names,
        dtype=dtypes,
        chunksize=int(1e5),
        skiprows=skiprows,
        nrows=nrows,
    )

    dfs = []
    for chunk in reader:
        extra = to_rows_gff3(chunk.Attribute.astype(str))
        without_attribute = chunk.drop("Attribute", axis=1)
        dfs.append(pd.concat([without_attribute, extra], axis=1, sort=False))

    df = pd.concat(dfs, sort=False)
    df.loc[:, "Start"] = df.Start - 1

    if as_df:
        return df
    return PyRanges(df)
def test_instantiation_with_str_for_strands_seqnames(chip_10_plus_one):
    """Smoke test: seqnames and strands may each be given as a single string.

    Passes only if the constructor call does not raise; nothing is asserted
    about the resulting object.
    """
    source = chip_10_plus_one

    # Scalars for seqnames/strands, `.values` for starts, the attribute
    # as-is for ends.
    pr = PyRanges(
        seqnames="chr1",
        starts=source.Start.values,
        ends=source.End,
        strands="+",
    )
def read_bam(f, output_df=False, mapq=0, required_flag=0, filter_flag=1540):
    """Read a BAM file with ``bamread.read_bam``.

    Parameters
    ----------
    f : str
        Path to the BAM file; forwarded to ``bamread.read_bam``.
    output_df : bool, default False
        Return whatever ``bamread.read_bam`` produced instead of wrapping
        it in a PyRanges.
    mapq, required_flag, filter_flag : int
        Forwarded positionally, in this order, to ``bamread.read_bam``.

    Returns
    -------
    PyRanges, or the raw ``bamread.read_bam`` result when ``output_df`` is
    True.
    """
    df = bamread.read_bam(f, mapq, required_flag, filter_flag)

    if output_df:
        return df
    return PyRanges(df)
def background():
    """Build a three-row PyRanges from an inline whitespace-delimited table.

    The first line is the header (Chromosome, Start, End, Name, Score,
    Strand).
    """
    table = (
        "Chromosome Start End Name Score Strand\n"
        "chr1 226987603 226987628 U0 0 -\n"
        "chr8 38747236 38747261 U0 0 +\n"
        "chr15 26105493 26105518 U0 0 +"
    )
    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    df = pd.read_table(StringIO(table), sep=r"\s+", header=0)
    return PyRanges(df)
def expected_result_self_unstranded(names):
    """Build a five-row PyRanges on chr1 from a headerless inline table.

    Parameters
    ----------
    names : sequence of str
        Column names; every name except the last is applied to the five
        fields of each row.
    """
    table = (
        "chr1 9916 9988 HWI-ST216_313:3:1203:10227:6568 1\n"
        "chr1 9939 9988 HWI-ST216_313:3:2301:15791:16298 1\n"
        "chr1 9951 9988 HWI-ST216_313:3:2205:20086:33508 1\n"
        "chr1 9953 9988 HWI-ST216_313:3:1305:6975:102491 1\n"
        "chr1 9978 9988 HWI-ST216_313:3:1204:5599:113305 1"
    )
    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    df = pd.read_table(StringIO(table), sep=r"\s+", names=names[:-1], header=None)
    return PyRanges(df)
def chip():
    """Build a three-row PyRanges from an inline whitespace-delimited table.

    The first line is the header (Chromosome, Start, End, Name, Score,
    Strand).
    """
    table = (
        "Chromosome Start End Name Score Strand\n"
        "chr1 226987592 226987617 U0 0 +\n"
        "chr8 38747226 38747251 U0 0 -\n"
        "chr15 26105515 26105540 U0 0 +"
    )
    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    df = pd.read_table(StringIO(table), sep=r"\s+", header=0)
    return PyRanges(df)
def expected_result_no_strand_plus_one(names):
    """Build a six-row PyRanges on chr1 from a headerless inline table.

    Parameters
    ----------
    names : sequence of str
        Column names applied to the six fields of each row.
    """
    table = (
        "chr1 9916 9988 HWI-ST216_313:3:1203:10227:6568 1 -\n"
        "chr1 9939 9988 HWI-ST216_313:3:2301:15791:16298 1 +\n"
        "chr1 9951 9988 HWI-ST216_313:3:2205:20086:33508 1 -\n"
        "chr1 9953 9988 HWI-ST216_313:3:1305:6975:102491 1 +\n"
        "chr1 9978 9988 HWI-ST216_313:3:1204:5599:113305 1 -\n"
        "chr1 110246 110445 HWI-ST216_313:3:1207:4315:142177 1 +"
    )
    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    df = pd.read_table(StringIO(table), sep=r"\s+", names=names, header=None)
    return PyRanges(df)
def expected_result_counterexample13():
    """Build a two-row PyRanges on chr1 from a headerless inline table.

    Columns are named Chromosome, Start, End, Strand, Start_b, End_b,
    Strand_b and Distance.
    """
    table = (
        "chr1 3538885 3832293 + 0 1 + 3538885\n"
        "chr1 4426346 9655531 + 0 1 + 4426346"
    )
    names = "Chromosome Start End Strand Start_b End_b Strand_b Distance".split()
    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    df = pd.read_table(StringIO(table), sep=r"\s+", header=None, names=names)
    return PyRanges(df)
def expected_result_minus():
    """Build a one-row PyRanges from a headerless inline table.

    Twelve column names (Chromosome ... Distance) are supplied for a row
    that has thirteen fields: a leading row number followed by the twelve
    values.
    """
    table = (
        "0 chr1 10241 10440 HWI-ST216_313:3:1302:4516:156396 1 - "
        "9988 10187 HWI-ST216:427:D29R1ACXX:2:1205:6095:16532 1 - 55"
    )
    names = (
        "Chromosome Start End Name Score Strand "
        "Start_b End_b Name_b Score_b Strand_b Distance"
    ).split()
    # Raw string: "\s" is not a valid escape sequence in a plain literal.
    df = pd.read_table(StringIO(table), sep=r"\s+", header=None, names=names)
    return PyRanges(df)