Beispiel #1
0
 def chr_filter(self, chrom):
     """
     Yield the parsed lines whose chromosome is named in chrom.

     chrom is an iterable of chromosome names in the same format as in the
     gelist, ie 'chr1' or 'ctg1', not just the number. A name that is not in
     self.chromosomes gets a printed warning and is otherwise ignored.

     Raises IndexError if nothing can be read at an indexed byte offset.
     """
     wanted = set()
     for name in chrom:
         if name in self.chromosomes:
             wanted.add(name)
         else:
             # A single parenthesised argument prints the same on Python 2 and 3.
             print("Warning: " + str(name) + " does not exist here")
     # Every index entry maps a (chromosome, start, end) key to a list of byte
     # offsets; a list usually holds one offset but may hold several.
     offset_lists = [
         offsets for coord, offsets in self.coord_to_index.items()
         if coord[0] in wanted
     ]
     for offsets in sorted(offset_lists):
         for offset in offsets:
             self.file.seek(offset)
             line = self.file.readline()
             if not line:
                 raise IndexError
             yield parse_gff_line(line, format=self.format)
Beispiel #2
0
    def range_filter(self,
                     chrom,
                     start,
                     end,
                     contained=False,
                     fraction_query=False,
                     fraction_subject=False):
        """
        Yield the parsed lines on chrom whose interval matches the query range start..end.

        Exactly one matching mode is used, chosen in this order:
          contained        - the interval must lie completely inside the query range.
          fraction_query   - overlapping bp / number of bp in the query must be at
                             least fraction_query.
          fraction_subject - overlapping bp / number of bp in the interval must be at
                             least fraction_subject.
          default          - the interval must share at least one position with the
                             query range; both are read as closed (inclusive)
                             intervals, like the comparison the contained mode uses.

        start, end and the indexed coordinates may be numbers or numeric strings.

        Raises IndexError if nothing can be read at an indexed byte offset, and
        ZeroDivisionError if the query has zero length in fraction_query mode or an
        interval on chrom has zero length in fraction_subject mode.
        """
        # The flags are compared with False (not truth-tested) so that mode
        # selection follows the same rules callers already rely on.
        if contained != False:
            query_start, query_end = int(start), int(end)

            def keep(begin, stop):
                return int(begin) >= query_start and int(stop) <= query_end
        elif fraction_query != False:
            query = (float(start), float(end))
            query_length = query[1] - query[0]
            threshold = float(fraction_query)

            def keep(begin, stop):
                shared = getOverlap(query, (float(begin), float(stop)))
                return shared / query_length >= threshold
        elif fraction_subject != False:
            query = (float(start), float(end))
            threshold = float(fraction_subject)

            def keep(begin, stop):
                subject = (float(begin), float(stop))
                shared = getOverlap(query, subject)
                return shared / (subject[1] - subject[0]) >= threshold
        else:
            query_start, query_end = int(start), int(end)

            def keep(begin, stop):
                # Two closed intervals overlap exactly when each one starts no
                # later than the other one ends. (Joining the two comparisons
                # with "or" would accept intervals far outside the query.)
                return int(begin) <= query_end and int(stop) >= query_start

        # Each index entry is a list of byte offsets: usually one, sometimes more.
        offset_lists = [
            offsets
            for (name, begin, stop), offsets in self.coord_to_index.items()
            if name == chrom and keep(begin, stop)
        ]
        for offsets in sorted(offset_lists):
            for offset in offsets:
                self.file.seek(offset)
                line = self.file.readline()
                if not line:
                    raise IndexError
                yield parse_gff_line(line, format=self.format)
Beispiel #3
0
 def next(self):
     """
     Iterator step: read one more line from the backing file and return it parsed.

     Raises StopIteration once the file has no further lines.
     """
     raw = self.file.readline()
     if raw:
         return parse_gff_line(raw, format=self.format)
     raise StopIteration
Beispiel #4
0
 def coord_filter(self, chrom, start, end):
     """
     Yield the parsed lines whose coordinates are exactly (chrom, start, end).

     start and end can be integers or strings; both are converted with str()
     before the lookup. When the coordinate is not indexed, a warning is
     printed and nothing is yielded.

     Raises IndexError if nothing can be read at an indexed byte offset.
     """
     key = (chrom, str(start), str(end))
     try:
         offsets = self.coord_to_index[key]  # see if coordinate exists
     except KeyError:
         # A single parenthesised argument prints the same on Python 2 and 3.
         print("Warning: coordinate " + str(chrom) + ", start: " + str(start) +
               ", end: " + str(end) + ", does not exist here")
         return
     for offset in sorted(offsets):  # go to each line with this coordinate
         self.file.seek(offset)
         line = self.file.readline()
         if not line:
             raise IndexError
         yield parse_gff_line(line, format=self.format)
Beispiel #5
0
 def feature_filter(self, features):
     """
     Yield the parsed lines whose feature is one of those named in features.

     features is an iterable of feature names. A name that is not in
     self.features gets a printed warning and is otherwise ignored. All
     warnings are printed before the first line is yielded; lines are then
     yielded feature by feature, in byte-offset order within each feature.

     Raises IndexError if nothing can be read at an indexed byte offset.
     """
     known = []  # requested features that appear in this gelist
     for feature in features:
         if feature in self.features:
             known.append(feature)
         else:
             # A single parenthesised argument prints the same on Python 2 and 3.
             print("Warning: " + str(feature) + " does not exist here")
     for feature in known:
         # feature_to_byte holds the byte offsets of the lines with this feature
         for offset in sorted(self.feature_to_byte[feature]):
             self.file.seek(offset)
             line = self.file.readline()
             if not line:
                 raise IndexError
             yield parse_gff_line(line, format=self.format)
Beispiel #6
0
 def sample_filter(self, samples):
     """
     Yield the parsed lines that belong to one of the samples named in samples.

     samples is an iterable of sample-name strings. A name that is not in
     self.samples gets a printed warning and is otherwise ignored. All
     warnings are printed before the first line is yielded; lines are then
     yielded sample by sample, in byte-offset order within each sample.

     Raises IndexError if nothing can be read at an indexed byte offset.
     """
     known = []  # requested samples that appear in this gelist
     for sample in samples:
         if sample in self.samples:
             known.append(sample)
         else:
             # A single parenthesised argument prints the same on Python 2 and 3.
             print("Warning: " + str(sample) + " does not exist here")
     for sample in known:
         # sample_to_byte holds the byte offsets of the lines for this sample
         for offset in sorted(self.sample_to_byte[sample]):
             self.file.seek(offset)
             line = self.file.readline()
             if not line:
                 raise IndexError
             yield parse_gff_line(line, format=self.format)
Beispiel #7
0
    def __init__(self,
                 filename,
                 from_string=False,
                 check_mc_values=False,
                 load_values=False,
                 format=False):
        """
        Build a gelist by reading the backing data once and indexing every line.

        Arguments:
        filename        -- path of the file to open, or the data itself when
                           from_string is true.
        from_string     -- treat filename as the data instead of as a path.
        check_mc_values -- exit unless every value in each line's mc_positions and
                           stacks_positions fields lies within that line's
                           start..end coordinates.
        load_values     -- also keep every parsed line in memory in self.values.
        format          -- "gff" or "bed". When not given, the format is the file
                           extension, or - for string data or a name without a "." -
                           guessed from the first line: 9 fields means "gff",
                           anything else "bed".

        Exits through sys.exit when the file cannot be opened, when a line is read
        while the format is neither "gff" nor "bed", or when a check_mc_values
        check fails.
        """
        if from_string:
            self.file = cStringIO.StringIO(filename)
        else:
            try:
                self.file = open(filename, 'r')
            except IOError:
                # Only I/O failures are reported this way; a bare except would
                # also swallow KeyboardInterrupt and SystemExit.
                sys.exit("Cannot open file " + filename)

        self.chromosomes = set()  # chromosomes present in this gelist
        self.samples = set()  # samples present in this gelist (gff only)
        self.coord_to_index = {}  # (chrom, start, end) -> list of byte offsets
        self.index_to_coord = {}  # byte offset -> (chrom, start, end)
        self.features = set()  # features present in this gelist (gff only)
        self.format = ""  # format of the file backing the gelist
        self.values = {}  # (chrom, start, end) -> parsed lines, if load_values
        self.line_to_byte = {}  # line number -> byte offset
        self.sample_to_byte = {}  # sample -> list of byte offsets
        self.feature_to_byte = {}  # feature -> list of byte offsets

        # Work out the format: explicit argument first, then the extension, then
        # the number of fields on the first line (gff files have 9 fields).
        if format:
            self.format = format
        elif from_string or filename.rfind(".") == -1:
            first_line = self.file.readline()
            # Most files are tab-separated; otherwise fall back to spaces.
            separator = "\t" if "\t" in first_line else " "
            if len(first_line.split(separator)) == 9:
                self.format = "gff"
            else:
                self.format = "bed"
        else:
            self.format = filename.split(".")[-1]

        self.file.seek(0)  # go back to beginning
        linenum = 0
        # readline() is used instead of the file iterator because f.tell()
        # doesn't work with the iterator.
        while True:
            byte = self.file.tell()  # offset at which this line starts
            raw = self.file.readline()
            if not raw:
                break

            line = parse_gff_line(raw, format=self.format)
            self.chromosomes.add(line[0])
            self.line_to_byte[linenum] = byte

            if self.format == "gff":
                self.samples.add(line[1])
                self.sample_to_byte.setdefault(line[1], []).append(byte)
                self.features.add(line[2])
                self.feature_to_byte.setdefault(line[2], []).append(byte)
                coord = (line[0], line[3], line[4])
            elif self.format == "bed":
                coord = (line[0], line[1], line[2])
            else:
                sys.exit(
                    "Unrecognized file argument of the gelist constructor")

            self.coord_to_index.setdefault(coord, []).append(byte)
            self.index_to_coord[byte] = coord

            if check_mc_values:
                # Both position fields must be present and every position in them
                # must fall inside this line's start..end; otherwise exit.
                for field, label in (("mc_positions", "mc"),
                                     ("stacks_positions", "stacks")):
                    if field not in line:
                        sys.exit("Missing " + field + " field in: " + str(line))
                    low, high = int(coord[1]), int(coord[2])
                    positions = line[field].split(',')
                    if any(int(position) < low or int(position) > high
                           for position in positions):
                        sys.exit(label + " values are incorrect on this line: " +
                                 str(line))

            # put in memory if necessary
            if load_values:
                self.values.setdefault(coord, []).append(line)
            linenum += 1
Beispiel #8
0
 def __getitem__(self, i):
     """
     Index into the gelist with the bracket operator.

     i may be:
       a line number - the line is read from its byte offset and returned parsed.
       a slice       - handed to self.slice as a (start, stop, step) tuple.
       a tuple       - a filter request, gelist[("filter_type", arguments)]:
           ("sample", [samples])
           ("chromosome", [chromosomes])
           ("feature", [features])
           ("coordinate", chrom, start, stop)
           ("range", chrom, start, stop)
           ("range", chrom, start, stop, "contained")
           ("range", chrom, start, stop, "fraction_query", fraction decimal)
           ("range", chrom, start, stop, "fraction_subject", fraction decimal)
         A filter request returns whatever the matching *_filter method returns.

     Exits through sys.exit when a filter request is malformed: wrong number or
     type of arguments, an unknown filter type, an unknown range option, or a
     fraction option without its decimal.
     Raises KeyError when i is a line number that is not indexed, and IndexError
     when nothing can be read at the indexed byte offset.
     """
     if isinstance(i, slice):
         indices = i.indices(len(self))  # (start, stop, step) tuple
         return self.slice(indices)

     if not isinstance(i, tuple):
         byte = self.line_to_byte[i]
         self.file.seek(byte)
         line = self.file.readline()
         if not line:
             raise IndexError
         return parse_gff_line(line, format=self.format)

     kind = i[0]

     # Filters that take exactly one argument, which must be a list.
     list_filters = {
         "sample": (
             self.sample_filter,
             "Filtering by sample requires 1 argument: a list of samples"),
         "chromosome": (
             self.chr_filter,
             "Filtering by chromosome requires 1 argument: a list of chromosomes"),
         "feature": (
             self.feature_filter,
             "Filtering by feature requires 1 argument: a list of features"),
     }
     if kind in list_filters:
         method, usage = list_filters[kind]
         if len(i) != 2 or not isinstance(i[1], list):
             sys.exit(usage)
         return method(i[1])

     if kind == "coordinate":
         if len(i) != 4:
             sys.exit(
                 "Filtering by coordinate requires 3 arguments: chromosome, start, stop"
             )
         return self.coord_filter(i[1], i[2], i[3])

     if kind == "range":
         if len(i) < 4 or len(i) > 6:
             sys.exit(
                 "Filtering by range requires at least 3 arguments: chromosome, start, stop."
             )
         if len(i) == 4:
             return self.range_filter(i[1], i[2], i[3])
         option = i[4]
         if option == "contained":
             return self.range_filter(i[1], i[2], i[3], contained=True)
         if option == "fraction_query" or option == "fraction_subject":
             # The fraction decimal is mandatory; without this check a missing
             # one would surface as a bare IndexError from i[5].
             if len(i) != 6:
                 sys.exit("Filtering by range with " + option +
                          " requires the fraction decimal as the next argument")
             if option == "fraction_query":
                 return self.range_filter(i[1], i[2], i[3],
                                          fraction_query=i[5])
             return self.range_filter(i[1], i[2], i[3],
                                      fraction_subject=i[5])
         # An unknown option would otherwise fall through and return None.
         sys.exit("Unrecognized range filter option: " + str(option))

     # An unknown filter type would otherwise fall through and return None.
     sys.exit("Unrecognized filter type: " + str(kind))