def get_nominal_annotations(self):
     """Collect the nominal essential and non-essential genes of cerevisiae.

     Only annotation rows whose "Mutant Information" column equals "null"
     are used. Gene names are resolved through ``self.feature_db``; names
     that resolve to ``None`` are dropped, as are features that are not ORFs
     or whose qualifier is "Dubious".

     Returns
     -------
     (set, set)
         The standard names of the essential genes (annotated as inviable),
         followed by the standard names of the non-essential genes
         (annotated as viable).
     """

     viable_path = Shared.get_dependency("cerevisiae", "cerevisiae_viable_annotations.txt")
     inviable_path = Shared.get_dependency("cerevisiae", "cerevisiae_inviable_annotations.txt")

     viable_rows = pd.read_csv(viable_path, skiprows=8, delimiter="\t")
     inviable_rows = pd.read_csv(inviable_path, skiprows=8, delimiter="\t")

     def lookup_null_mutants(rows):
         # Resolve every distinct gene carrying a "null" mutant annotation.
         null_genes = set(rows[rows["Mutant Information"] == "null"]["Gene"])
         found = set(self.feature_db.get_feature_by_name(gene) for gene in null_genes)
         # Names unknown to the feature database come back as None.
         found.discard(None)
         return found

     def is_consensus_orf(feature):
         return feature.is_orf and feature.feature_qualifier != "Dubious"

     viable_features = lookup_null_mutants(viable_rows)
     inviable_features = lookup_null_mutants(inviable_rows)

     # TODO: the dubious genes shouldn't be filtered here.
     viable_orfs = [feature for feature in viable_features if is_consensus_orf(feature)]
     inviable_orfs = [feature for feature in inviable_features if is_consensus_orf(feature)]

     essential_names = set(feature.standard_name for feature in inviable_orfs)
     non_essential_names = set(feature.standard_name for feature in viable_orfs)
     return (essential_names, non_essential_names)
def get_calb_orths_in_sp():
    """Map C. albicans features onto S. pombe features.

    Two tab-separated tables from the "albicans" dependency directory are
    read (the ortholog table and the best-hits table) and concatenated with
    the ortholog rows first, so for an albicans name present in both tables
    the ortholog row is the one that is used.

    Returns
    -------
    dict
        Maps the standard name of each albicans feature that has at least one
        table row onto the ``name`` of the pombe feature found for the first
        such row. Features without a row, or whose pombe name is not known to
        the pombe feature database, are left out.
    """
    column_names = ['albicans standard name', 'albicans common name', 'albicans alb_db id',
                    'pombe standard name', 'pombe common name', 'pombe alb_db id']

    def read_table(filename):
        # Both tables share one layout; only the two name columns are kept.
        return pd.read_csv(Shared.get_dependency("albicans", filename),
                           skiprows=8,
                           delimiter='\t',
                           header=None,
                           usecols=['albicans standard name', 'pombe standard name'],
                           names=column_names)

    pom_db = GenomicFeatures.default_pom_db()

    ortholog_table = read_table("C_albicans_SC5314_S_pombe_orthologs.txt")

    # TODO: we probably don't want to use the hit table, though the InParanoid
    # table is very stringent.
    best_hit_table = read_table("C_albicans_SC5314_S_pombe_best_hits.txt")

    joined_table = pd.concat([ortholog_table, best_hit_table])

    # Record the first pombe name seen for every albicans name in one pass,
    # instead of re-scanning the whole table for each albicans feature.
    first_pombe_name = {}
    for alb_name, pom_name in zip(joined_table["albicans standard name"],
                                  joined_table["pombe standard name"]):
        first_pombe_name.setdefault(alb_name, pom_name)

    result = {}
    for alb_feature in GenomicFeatures.default_alb_db().get_all_features():
        if alb_feature.standard_name not in first_pombe_name:
            continue

        pom_feature = pom_db.get_feature_by_name(first_pombe_name[alb_feature.standard_name])
        if pom_feature:
            result[alb_feature.standard_name] = pom_feature.name

    return result
 def _get_spom_essentials(self):
     """Read the pombe viability annotations.

     The headerless, tab-separated file ``pombe/FYPOviability.tsv`` is read
     into two columns: the gene's standard name and its essentiality label.

     Returns
     -------
     (set, set)
         The standard names labelled "inviable", followed by the standard
         names labelled "viable".
     """
     viability_table = pd.read_csv(Shared.get_dependency("pombe/FYPOviability.tsv"),
                                   header=None,
                                   delimiter='\t',
                                   names=["pombe standard name", "essentiality"])

     # Select by column label with one vectorised mask per class. This
     # replaces two row-by-row iterrows() passes and the positional
     # r[0] / r[1] lookups, which pandas deprecates on label-indexed Series.
     names = viability_table["pombe standard name"]
     labels = viability_table["essentiality"]

     return set(names[labels == "inviable"]), set(names[labels == "viable"])
Beispiel #4
0
def get_calb_orths_in_sp():
    """Map C. albicans features onto S. pombe features.

    Two tab-separated tables from the "albicans" dependency directory are
    read (the ortholog table and the best-hits table) and concatenated with
    the ortholog rows first, so for an albicans name present in both tables
    the ortholog row is the one that is used.

    Returns
    -------
    dict
        Maps the standard name of each albicans feature that has at least one
        table row onto the ``name`` of the pombe feature found for the first
        such row. Features without a row, or whose pombe name is not known to
        the pombe feature database, are left out.
    """
    column_names = [
        'albicans standard name', 'albicans common name',
        'albicans alb_db id', 'pombe standard name', 'pombe common name',
        'pombe alb_db id'
    ]

    def read_table(filename):
        # Both tables share one layout; only the two name columns are kept.
        return pd.read_csv(
            Shared.get_dependency("albicans", filename),
            skiprows=8,
            delimiter='\t',
            header=None,
            usecols=['albicans standard name', 'pombe standard name'],
            names=column_names)

    pom_db = GenomicFeatures.default_pom_db()

    ortholog_table = read_table("C_albicans_SC5314_S_pombe_orthologs.txt")

    # TODO: we probably don't want to use the hit table, though the InParanoid
    # table is very stringent.
    best_hit_table = read_table("C_albicans_SC5314_S_pombe_best_hits.txt")

    joined_table = pd.concat([ortholog_table, best_hit_table])

    # Record the first pombe name seen for every albicans name in one pass,
    # instead of re-scanning the whole table for each albicans feature.
    first_pombe_name = {}
    name_pairs = zip(joined_table["albicans standard name"],
                     joined_table["pombe standard name"])
    for alb_name, pom_name in name_pairs:
        if alb_name not in first_pombe_name:
            first_pombe_name[alb_name] = pom_name

    result = {}
    for alb_feature in GenomicFeatures.default_alb_db().get_all_features():
        if alb_feature.standard_name not in first_pombe_name:
            continue

        pom_feature = pom_db.get_feature_by_name(
            first_pombe_name[alb_feature.standard_name])
        if pom_feature:
            result[alb_feature.standard_name] = pom_feature.name

    return result
Beispiel #5
0
def analyze_deletions(
        bam_file,
        threshold=50,
        output_path="/Users/bermanlab/dev/transposon-pipeline/dependencies/albicans/deleted_regions.csv"
):
    """Report the reference regions that no read of a BAM file covers.

    The chromosome names and lengths are taken from the albicans HapA
    reference FASTA. Every read returned by ``fetch()`` contributes its
    aligned span, reads on chromosomes whose name contains "chrM" are
    skipped, and the spans are subtracted from the full chromosome ranges.
    The uncovered ranges are written with ``write_ranges`` and a summary is
    printed.

    Parameters
    ----------
    bam_file : str
        Path handed to ``pysam.AlignmentFile`` (opened in "rb" mode).
    threshold : int
        Uncovered ranges with ``end - start >= threshold`` are additionally
        pretty-printed and counted separately.
    output_path : str
        Destination passed to ``write_ranges``. Defaults to the path that
        used to be hard-coded in this function.
    """
    bam_reader = pysam.AlignmentFile(bam_file, "rb")
    try:
        fasta_file = Shared.get_dependency(
            os.path.join(
                "albicans", "reference genome",
                "C_albicans_SC5314_version_A22-s07-m01-r08_chromosomes_HapA.fasta"
            ))
        chrom_names = []
        chrom_lens = {}
        for record in SeqIO.parse(fasta_file, "fasta"):
            chrom_names.append(record.id)
            chrom_lens[record.id] = RangeSet([(1, len(record))])

        seen = {chrom: [] for chrom in chrom_names}

        for read in bam_reader.fetch():
            chrom_name = bam_reader.getrname(read.reference_id)
            if "chrM" in chrom_name:
                continue
            # pysam documents reference_start as 0-based and reference_end as
            # one past the last aligned base; both are shifted here so they
            # line up with the (1, len) chromosome ranges built above.
            seen[chrom_name].append(
                (read.reference_start + 1, read.reference_end - 1 + 1))
    finally:
        # The reader is only needed while collecting the spans; close it
        # even if parsing or fetching fails.
        bam_reader.close()

    unseen = {
        chrom: chrom_lens[chrom] - RangeSet(seen[chrom])
        for chrom in chrom_names
    }
    write_ranges(unseen, output_path)

    # NOTE(review): this filter compares end - start, while the lengths
    # printed below use end - start + 1 -- confirm which one is intended.
    ranges = {
        chrom: [r for r in unseen[chrom] if r[1] - r[0] >= threshold]
        for chrom in chrom_names
    }

    pprint(ranges)

    # Single-argument print(...) calls produce the same output under the
    # Python 2 print statement and the Python 3 print function.
    print("Total unseen: %s" % (sum(r.coverage for r in unseen.values()),))
    print("Total filtered unseen: %s" % (sum(
        sum(r[1] - r[0] + 1 for r in rs) for rs in ranges.values()),))

    # Imported locally, as in the original code; the feature database is
    # created once rather than once per chromosome.
    import GenomicFeatures
    alb_db = GenomicFeatures.default_alb_db() if chrom_names else None

    for chrom in chrom_names:
        print(chrom)
        print("Total subranges: %s" % (len(unseen[chrom]),))
        print("Total length: %s" % (unseen[chrom].coverage,))
        print("Ignored long subranges: %s" % (len(ranges[chrom]),))
        print("Total length: %s" %
              (sum(r[1] - r[0] + 1 for r in ranges[chrom]),))
        print("\n")

        for r in unseen[chrom]:
            fs = alb_db.get_features_at_range(chrom, r)
            if fs:
                print("%s %s %s" %
                      (chrom, r, ", ".join(f.standard_name for f in fs)))

        print("\n")
Beispiel #6
0
    def _get_spom_essentials(self):
        """Read the pombe viability annotations.

        The headerless, tab-separated file ``pombe/FYPOviability.tsv`` is
        read into two columns: the gene's standard name and its essentiality
        label.

        Returns
        -------
        (set, set)
            The standard names labelled "inviable", followed by the standard
            names labelled "viable".
        """
        viability_table = pd.read_csv(
            Shared.get_dependency("pombe/FYPOviability.tsv"),
            header=None,
            delimiter='\t',
            names=["pombe standard name", "essentiality"])

        def names_labelled(label):
            # Label-based, vectorised selection. This replaces iterrows() with
            # positional r[0] / r[1] access, which pandas deprecates on
            # label-indexed Series.
            matches = viability_table["essentiality"] == label
            return set(viability_table.loc[matches, "pombe standard name"])

        return names_labelled("inviable"), names_labelled("viable")
Beispiel #7
0
    def _get_homologous_regions(self):
        """Load the albicans homologous regions, dropping the short ones.

        The range data is read from ``albicans/homologous_regions.csv`` via
        ``self._read_range_data``.

        Returns
        -------
        dict
            Maps each chromosome onto a ``RangeSet`` holding only the ranges
            whose length (``end - start + 1``) is at least
            ``self._ignore_region_threshold``.
        """
        regions_path = Shared.get_dependency(
            os.path.join("albicans", "homologous_regions.csv"))
        ranges = self._read_range_data(regions_path)

        long_regions = {}
        for chrom, range_set in ranges.iteritems():
            long_regions[chrom] = RangeSet(
                region for region in range_set
                if region[1] - region[0] + 1 >= self._ignore_region_threshold)
        return long_regions
Beispiel #8
0
    def get_nominal_annotations(self):
        """Collect the nominal essential and non-essential genes of cerevisiae.

        Only annotation rows whose "Mutant Information" column equals "null"
        are used. Gene names are resolved through ``self.feature_db``; names
        that resolve to ``None`` are dropped, as are features that are not
        ORFs or whose qualifier is "Dubious".

        Returns
        -------
        (set, set)
            The standard names of the essential genes (annotated as
            inviable), followed by the standard names of the non-essential
            genes (annotated as viable).
        """

        viable_filepath = Shared.get_dependency(
            "cerevisiae", "cerevisiae_viable_annotations.txt")
        inviable_filepath = Shared.get_dependency(
            "cerevisiae", "cerevisiae_inviable_annotations.txt")

        viable_table = pd.read_csv(viable_filepath, skiprows=8, delimiter="\t")
        inviable_table = pd.read_csv(inviable_filepath,
                                     skiprows=8,
                                     delimiter="\t")

        # Distinct gene names with a "null" mutant annotation, per table.
        viable_genes = set(
            viable_table.loc[viable_table["Mutant Information"] == "null",
                             "Gene"])
        inviable_genes = set(
            inviable_table.loc[inviable_table["Mutant Information"] == "null",
                               "Gene"])

        # Names unknown to the feature database come back as None.
        viable_features = {
            self.feature_db.get_feature_by_name(gene)
            for gene in viable_genes
        } - {None}
        inviable_features = {
            self.feature_db.get_feature_by_name(gene)
            for gene in inviable_genes
        } - {None}

        non_essential = set()
        essential = set()
        for names, features in ((non_essential, viable_features),
                                (essential, inviable_features)):
            for feature in features:
                # TODO: the dubious genes shouldn't be filtered here.
                if not feature.is_orf or feature.feature_qualifier == "Dubious":
                    continue
                names.add(feature.standard_name)

        return (essential, non_essential)
def analyze_deletions(bam_file, threshold=50,
                      output_path="/Users/bermanlab/dev/transposon-pipeline/dependencies/albicans/deleted_regions.csv"):
    """Report the reference regions that no read of a BAM file covers.

    The chromosome names and lengths are taken from the albicans HapA
    reference FASTA. Every read returned by ``fetch()`` contributes its
    aligned span, reads on chromosomes whose name contains "chrM" are
    skipped, and the spans are subtracted from the full chromosome ranges.
    The uncovered ranges are written with ``write_ranges`` and a summary is
    printed.

    Parameters
    ----------
    bam_file : str
        Path handed to ``pysam.AlignmentFile`` (opened in "rb" mode).
    threshold : int
        Uncovered ranges with ``end - start >= threshold`` are additionally
        pretty-printed and counted separately.
    output_path : str
        Destination passed to ``write_ranges``. Defaults to the path that
        used to be hard-coded in this function.
    """
    bam_reader = pysam.AlignmentFile(bam_file, "rb")
    try:
        fasta_file = Shared.get_dependency(os.path.join("albicans", "reference genome", "C_albicans_SC5314_version_A22-s07-m01-r08_chromosomes_HapA.fasta"))
        chrom_names = []
        chrom_lens = {}
        for record in SeqIO.parse(fasta_file, "fasta"):
            chrom_names.append(record.id)
            chrom_lens[record.id] = RangeSet([(1, len(record))])

        seen = {chrom: [] for chrom in chrom_names}

        for read in bam_reader.fetch():
            chrom_name = bam_reader.getrname(read.reference_id)
            if "chrM" in chrom_name:
                continue
            # pysam documents reference_start as 0-based and reference_end as
            # one past the last aligned base; both are shifted here so they
            # line up with the (1, len) chromosome ranges built above.
            seen[chrom_name].append((read.reference_start+1, read.reference_end-1+1))
    finally:
        # The reader is only needed while collecting the spans; close it
        # even if parsing or fetching fails.
        bam_reader.close()

    unseen = {chrom: chrom_lens[chrom] - RangeSet(seen[chrom]) for chrom in chrom_names}
    write_ranges(unseen, output_path)

    # NOTE(review): this filter compares end - start, while the lengths
    # printed below use end - start + 1 -- confirm which one is intended.
    ranges = {chrom: [r for r in unseen[chrom] if r[1] - r[0] >= threshold] for chrom in chrom_names}

    pprint(ranges)

    # Single-argument print(...) calls produce the same output under the
    # Python 2 print statement and the Python 3 print function.
    print("Total unseen: %s" % (sum(r.coverage for r in unseen.values()),))
    print("Total filtered unseen: %s" % (sum(sum(r[1]-r[0]+1 for r in rs) for rs in ranges.values()),))

    # Imported locally, as in the original code; the feature database is
    # created once rather than once per chromosome.
    import GenomicFeatures
    alb_db = GenomicFeatures.default_alb_db() if chrom_names else None

    for chrom in chrom_names:
        print(chrom)
        print("Total subranges: %s" % (len(unseen[chrom]),))
        print("Total length: %s" % (unseen[chrom].coverage,))
        print("Ignored long subranges: %s" % (len(ranges[chrom]),))
        print("Total length: %s" % (sum(r[1]-r[0]+1 for r in ranges[chrom]),))
        print("\n")

        for r in unseen[chrom]:
            fs = alb_db.get_features_at_range(chrom, r)
            if fs:
                print("%s %s %s" % (chrom, r, ", ".join(f.standard_name for f in fs)))

        print("\n")
Beispiel #10
0
                                     (Adaptor cleaning works the same as with R1.)
   -d  --delete-originals           Delete input FASTQ files.
   -k  --keep-clean-fqs             Keep the cleaned FASTQ files.
   -p  --primer-check               Check primer specificity if percent transposon in reads is low.
   -h  --help                       Show this help message and exit 
'''

# Transposon primer followed by its tail, and the reverse complement of that
# sequence.
TnPrimerAndTail = 'GTATTTTACCGACCGTTACCGACCGTTTTCATCCCTA'
TnRev = 'TAGGGATGAAAACGGTCGGTAACGGTCGGTAAAATAC'
# The primer alone (a prefix of TnPrimerAndTail) and its reverse complement.
PrimerOnly = 'GTATTTTACCGACCGTTACCGACC'
PrimerRev = 'GGTCGGTAACGGTCGGTAAAATAC'
AdapterSeq = 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC'

# NB: bowtie2 requires spaces to be escaped with a backslash for the -x parameter.
# The replacement is spelled '\\ ' because a bare '\ ' is not a recognised
# escape sequence and triggers a warning on newer Python versions; the
# resulting string (backslash + space) is the same.
CInd = Shared.get_dependency(
    "albicans", "reference genome",
    "C_albicans_SC5314_version_A22-s07-m01-r08_chromosomes_HapA").replace(
        ' ', '\\ ')
CORES = 4  # Cores on the machine = how many threads should the external tools utilize


def GetCmdPath(Program):
    """Gets the path to a desired program file on given computer.

    Parameters
    ----------
        Program :   string  
            Name of program wish to have path for.

    Returns
    -------
        CmdPath :   string
import Shared


					
# Command-line help text.
usage = '''CreateHitFile.py  
   -i  --in-dir     [str]   Input directory with .bam files to parse. Defaults to current directory if left unspecified.
   -o  --out-dir    [str]   Output directory to which the hit file will be writen. Defaults to current directory if left unspecified.
   -q  --min-mapq   [int]   Map Quality - hits to parse from the bam file (default is 20)
   -m  --merge-dist [int]   Hits to merge with at most x nt distance between two hits. Default is 2 
                                Example: Hits in positions 1 and 3  (3-1=2) will be united into a single hit
   -h  --help               Show this help message and exit 
'''


# TODO: move to config file.
# Reference chromosome sequences and the chromosomal feature table.
ChrFile = Shared.get_dependency(
    "albicans",
    "reference genome",
    "C_albicans_SC5314_version_A22-s07-m01-r08_chromosomes_HapA.fasta",
)
FeatureFName = Shared.get_dependency(
    "albicans",
    "reference genome",
    "C_albicans_SC5314_version_A22-s07-m01-r08_chromosomal_feature.tab",
)

# Column names for the chromosomal feature table, in order.
ChrFeatCols = [
    "FeatureName",
    "GeneName",
    "Aliases",
    "FeatureType",
    "Chromosome",
    "StartCoord",
    "StopCoord",
    "Strand",
    "PrimaryCGDID",
    "SecondaryCGDID",
    "Description",
    "DateCreated",
    "SeqCoordVerDate",
    "Blank1",
    "Blank2",
    "GeneNameReserDate",
    "ReservedIsstandardName",
    "SC_ortholog",
]

def FindHitsPerSample(SamAlign, ChrFeatMap, Sep0N = 2,MapQ=10):
    """Goes through Sam file, checks for high confidence alignment, unites unique positions if they can be aligned with adjunct positions.

    Parameters
    ----------
        SamAlign    :   x
        ChrFeatMap  :   x
        SepON   :   integer
        MapQ    :   integer
            Setting for map quality. Default of 10 is equal to 1% chance of occurring in another position.
 def _get_deleted_regions(self):
     """Load the albicans deleted regions, dropping the short ones.

     The range data is read from ``albicans/deleted_regions.csv`` via
     ``self._read_range_data``.

     Returns
     -------
     dict
         Maps each chromosome onto a ``RangeSet`` holding only the ranges
         whose length (``end - start + 1``) is at least
         ``self._ignore_region_threshold``.
     """
     regions_path = Shared.get_dependency(os.path.join("albicans", "deleted_regions.csv"))
     ranges = self._read_range_data(regions_path)

     long_regions = {}
     for chrom, range_set in ranges.iteritems():
         long_regions[chrom] = RangeSet(
             region for region in range_set
             if region[1] - region[0] + 1 >= self._ignore_region_threshold)
     return long_regions
Beispiel #13
0
import pysam
import itertools
import Shared

# Command-line help text.
usage = '''CreateHitFile.py  
   -i  --in-dir     [str]   Input directory with .bam files to parse. Defaults to current directory if left unspecified.
   -o  --out-dir    [str]   Output directory to which the hit file will be writen. Defaults to current directory if left unspecified.
   -q  --min-mapq   [int]   Map Quality - hits to parse from the bam file (default is 20)
   -m  --merge-dist [int]   Hits to merge with at most x nt distance between two hits. Default is 2 
                                Example: Hits in positions 1 and 3  (3-1=2) will be united into a single hit
   -h  --help               Show this help message and exit 
'''

# TODO: move to config file.
# Reference chromosome sequences and the chromosomal feature table.
ChrFile = Shared.get_dependency('albicans', 'reference genome', 'C_albicans_SC5314_version_A22-s07-m01-r08_chromosomes_HapA.fasta')
FeatureFName = Shared.get_dependency('albicans', 'reference genome', 'C_albicans_SC5314_version_A22-s07-m01-r08_chromosomal_feature.tab')

# Column names for the chromosomal feature table, in order.
ChrFeatCols = [
    'FeatureName', 'GeneName', 'Aliases', 'FeatureType',
    'Chromosome', 'StartCoord', 'StopCoord', 'Strand',
    'PrimaryCGDID', 'SecondaryCGDID',
    'Description', 'DateCreated', 'SeqCoordVerDate',
    'Blank1', 'Blank2',
    'GeneNameReserDate', 'ReservedIsstandardName', 'SC_ortholog',
]


def FindHitsPerSample(SamAlign, ChrFeatMap, Sep0N=2, MapQ=10):
    """Goes through Sam file, checks for high confidence alignment, unites unique positions if they can be aligned with adjunct positions.

    Parameters
    ----------
        SamAlign    :   x
 def _get_genes_with_paralogs(self):
     """Delegate to ``Organism._get_genes_with_paralogs`` with the pombe list.

     The path of ``pombe/hasParalogs_sp.txt`` is resolved through
     ``Shared.get_dependency`` and the base implementation's result is
     returned unchanged.
     """
     paralogs_path = Shared.get_dependency(os.path.join("pombe", "hasParalogs_sp.txt"))
     return Organism._get_genes_with_paralogs(self, paralogs_path)
Beispiel #15
0
 def _get_genes_with_paralogs(self):
     """Delegate to ``Organism._get_genes_with_paralogs`` with the albicans list.

     The path of ``albicans/hasParalogs_ca.txt`` is resolved through
     ``Shared.get_dependency`` and the base implementation's result is
     returned unchanged.
     """
     relative_path = os.path.join("albicans", "hasParalogs_ca.txt")
     paralogs_path = Shared.get_dependency(relative_path)
     return Organism._get_genes_with_paralogs(self, paralogs_path)
 def _get_homologous_regions(self):
     """Load the cerevisiae homologous regions, dropping the short ones.

     The range data is read from ``cerevisiae/homologous_regions.csv`` via
     ``self._read_range_data``.

     Returns
     -------
     dict
         Maps each chromosome onto a ``RangeSet`` holding only the ranges
         whose length (``end - start + 1``) is at least
         ``self._ignore_region_threshold``.
     """
     regions_path = Shared.get_dependency(os.path.join("cerevisiae", "homologous_regions.csv"))
     ranges = self._read_range_data(regions_path)

     long_regions = {}
     for chrom, range_set in ranges.iteritems():
         long_regions[chrom] = RangeSet(
             region for region in range_set
             if region[1] - region[0] + 1 >= self._ignore_region_threshold)
     return long_regions
 def _get_genes_with_paralogs(self):
     """Delegate to ``Organism._get_genes_with_paralogs`` with the albicans list.

     The path of ``albicans/hasParalogs_ca.txt`` is resolved through
     ``Shared.get_dependency`` and the base implementation's result is
     returned unchanged.
     """
     paralogs_path = Shared.get_dependency(os.path.join("albicans", "hasParalogs_ca.txt"))
     return Organism._get_genes_with_paralogs(self, paralogs_path)
   -r  --reverse-strand             Search with reverse complement sequence for R2 files.
                                     (Adaptor cleaning works the same as with R1.)
   -d  --delete-originals           Delete input FASTQ files.
   -k  --keep-clean-fqs             Keep the cleaned FASTQ files.
   -p  --primer-check               Check primer specificity if percent transposon in reads is low.
   -h  --help                       Show this help message and exit 
'''

# Transposon primer followed by its tail, and the reverse complement of that
# sequence.
TnPrimerAndTail = 'GTATTTTACCGACCGTTACCGACCGTTTTCATCCCTA'
TnRev = 'TAGGGATGAAAACGGTCGGTAACGGTCGGTAAAATAC'
# The primer alone (a prefix of TnPrimerAndTail) and its reverse complement.
PrimerOnly = 'GTATTTTACCGACCGTTACCGACC'
PrimerRev = 'GGTCGGTAACGGTCGGTAAAATAC'
AdapterSeq = 'AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC'

# NB: bowtie2 requires spaces to be escaped with a backslash for the -x parameter.
# The replacement is spelled '\\ ' because a bare '\ ' is not a recognised
# escape sequence and triggers a warning on newer Python versions; the
# resulting string (backslash + space) is the same.
CInd = Shared.get_dependency("albicans", "reference genome", "C_albicans_SC5314_version_A22-s07-m01-r08_chromosomes_HapA").replace(' ', '\\ ')
CORES = 4 # Cores on the machine = how many threads should the external tools utilize

def GetCmdPath(Program):
    """Gets the path to a desired program file on given computer.

    Parameters
    ----------
        Program :   string  
            Name of program wish to have path for.

    Returns
    -------
        CmdPath :   string
            Path for calling program as a command in the POSIX terminal.
    """
Beispiel #19
0
 def _get_genes_with_paralogs(self):
     """Delegate to ``Organism._get_genes_with_paralogs`` with the pombe list.

     The path of ``pombe/hasParalogs_sp.txt`` is resolved through
     ``Shared.get_dependency`` and the base implementation's result is
     returned unchanged.
     """
     relative_path = os.path.join("pombe", "hasParalogs_sp.txt")
     paralogs_path = Shared.get_dependency(relative_path)
     return Organism._get_genes_with_paralogs(self, paralogs_path)