Beispiel #1
0
 def __init__(self, autosql, parent=None, delim=""):
     """Create an |AutoSqlField|

     After the base-class initializer has run, the formatter for the
     field's type (``self.attr["type"]``) is looked up first in
     ``self.field_types`` and then in ``self.parent.field_types``. If
     neither lookup succeeds, ``str`` is used as the formatter and a
     |DataWarning| is issued.

     Parameters
     ----------
     autosql : str
         Block of autoSql text specifying format of element

     parent : instance of subclass of |AbstractAutoSqlObject| or None, optional
         Parent / enclosing element. Default: None

     delim : str, optional
         Field delimiter, forwarded unchanged to
         ``AbstractAutoSqlElement.__init__``. The signature default is the
         empty string. NOTE(review): earlier documentation described the
         default as tab -- confirm how the base class treats ``""``.
     """
     AbstractAutoSqlElement.__init__(self,
                                     autosql,
                                     parent=parent,
                                     delim=delim)
     type_ = self.attr["type"]
     try:
         self.formatter = self.field_types[type_][0]
     except KeyError:
         try:
             self.formatter = self.parent.field_types[type_][0]
         except Exception:
             # Deliberate best-effort fallback (e.g. `parent` is None, or the
             # parent does not know the type either). `Exception` instead of
             # a bare `except` so KeyboardInterrupt / SystemExit propagate.
             self.formatter = str
             warn(
                 "Could not find formatter for field '%s' of type '%s'. Casting to 'string' instead."
                 % (self.attr["name"], type_), DataWarning)
Beispiel #2
0
    def _assemble(self, line):
        """Convert one line of a `BED`_ file into an object built by ``self.return_type``

        ``self.counter`` is incremented for every line seen. Blank lines,
        ``browser`` lines and lines starting with ``#`` are skipped. ``track``
        lines are handed (minus the leading ``track`` keyword) to
        ``self._parse_track_line`` and then skipped. Whenever a line is
        skipped, the result of ``self.__next__()`` is returned instead.

        A line that cannot be parsed is appended to ``self.rejected``, a
        |FileFormatWarning| is issued, and the result of ``self.__next__()``
        is returned.

        Parameters
        ----------
        line : str
            Line of text from a `BED`_ file

        Returns
        -------
        object
            Whatever ``self.return_type.from_bed`` returns for the line, or
            the result of ``self.__next__()`` if the line was skipped or rejected
        """
        # NOTE(review): every skipped line re-enters via `self.__next__()`;
        # if that calls back into `_assemble`, long runs of skipped lines
        # deepen the call stack -- confirm against `__next__`.
        self.counter += 1
        if line.strip() == "" or line.startswith(("browser", "#")):
            return self.__next__()
        elif line.startswith("track"):
            # reset metadata
            self._parse_track_line(line[5:])
            return self.__next__()

        try:
            return self.return_type.from_bed(line, extra_columns=self.extra_columns)
        except Exception:
            # Any parsing failure rejects the line. `Exception` instead of a
            # bare `except` so KeyboardInterrupt / SystemExit still propagate.
            self.rejected.append(line)
            msg = "Cannot parse BED line number %s. " % self.counter
            if self.metadata.get("type", None) is not None:
                msg += "Are you sure this is a %s BED file with extra columns (%s)?" % (
                    self.metadata.get("type"),
                    self._get_extra_column_names(),
                )
            elif self.extra_columns != 0:
                msg += "Are you sure this BED file has extra columns (%s)?" % self._get_extra_column_names()
            else:
                msg += "Maybe this BED has extra columns (i.e. is an extended BED file)?"

            msg += "\n    %s" % line
            warn(msg, FileFormatWarning)
            return self.__next__()
Beispiel #3
0
    def _assemble(self, line):
        """Convert one line of a `BED`_ file into an object built by ``self.return_type``

        ``self.counter`` is incremented for every line seen. Blank lines,
        ``browser`` lines and lines starting with ``#`` are skipped. ``track``
        lines are handed (minus the leading ``track`` keyword) to
        ``self._parse_track_line`` and then skipped. Whenever a line is
        skipped, the result of ``self.__next__()`` is returned instead.

        A line that cannot be parsed is appended to ``self.rejected``, a
        |FileFormatWarning| is issued, and the result of ``self.__next__()``
        is returned.

        Parameters
        ----------
        line : str
            Line of text from a `BED`_ file

        Returns
        -------
        object
            Whatever ``self.return_type.from_bed`` returns for the line, or
            the result of ``self.__next__()`` if the line was skipped or rejected
        """
        self.counter += 1

        skip_line = (line.strip() == ""
                     or line.startswith(("browser", "#")))
        if skip_line:
            return self.__next__()

        if line.startswith("track"):
            # reset metadata
            self._parse_track_line(line[5:])
            return self.__next__()

        try:
            return self.return_type.from_bed(
                line, extra_columns=self.extra_columns)
        except Exception:
            # Rejecting unparseable lines is deliberate, but a bare `except`
            # would also trap KeyboardInterrupt / SystemExit; `Exception`
            # lets those through.
            self.rejected.append(line)
            msg = "Cannot parse BED line number %s. " % self.counter
            if self.metadata.get("type", None) is not None:
                msg += (
                    "Are you sure this is a %s BED file with extra columns (%s)?"
                    % (self.metadata.get("type"),
                       self._get_extra_column_names()))
            elif self.extra_columns != 0:
                msg += (
                    "Are you sure this BED file has extra columns (%s)?" %
                    self._get_extra_column_names())
            else:
                msg += "Maybe this BED has extra columns (i.e. is an extended BED file)?"

            msg += ("\n    %s" % line)
            warn(msg, FileFormatWarning)
            return self.__next__()
Beispiel #4
0
def parse_GFF3_tokens(inp, list_types=None):
    """Helper function to parse tokens in the final column of a `GFF3`_ file
    into a dictionary of attributes. Because the following attributes are
    permitted to have multiple values in the `GFF3`_ spec, their values, if
    present, are intended to be returned as lists rather than strings:

        - `Parent`
        - `Alias`
        - `Note`
        - `Dbxref`
        - `Ontology_term`

    All keys and values are passed through ``unescape_GFF3``.

    If a key occurs more than once, a |FileFormatWarning| is issued and the
    values are combined: list-valued attributes are concatenated into one
    list, all other attributes are joined into one comma-separated string.

    Examples
    --------
        >>> tokens = 'a=1;c=3;b=2;e=5;d=4;z=26;Parent=gene01'
        >>> parse_GFF3_tokens(tokens, list_types=['Parent'])
        {'a': '1', 'c': '3', 'b': '2', 'e': '5', 'd': '4', 'z': '26', 'Parent': ['gene01']}

        >>> tokens = 'a=1;c=3,7;b=2;e=5;d=4;z=26;Parent=gene01,gene02'
        >>> parse_GFF3_tokens(tokens, list_types=['Parent'])
        {'a': '1', 'c': '3,7', 'b': '2', 'e': '5', 'd': '4', 'z': '26', 'Parent': ['gene01', 'gene02']}

    Parameters
    ----------
    inp : str
        Ninth column of `GFF3`_ entry

    list_types : list, optional
        Names of attributes that should be returned as lists
        (Default: the module-level ``_GFF3_DEFAULT_LISTS``)

    Returns
    -------
    dict : key-value pairs

    Raises
    ------
    ValueError
        If a non-empty token contains no ``=``
    """
    if list_types is None:
        list_types = _GFF3_DEFAULT_LISTS

    d = {}
    for item in inp.strip("\n").strip(";").split(";"):
        if len(item) == 0:
            continue

        # Split on the first '=' only, so a value that carries an unescaped
        # '=' is kept intact instead of aborting the whole line.
        key, val = item.split("=", 1)
        key = unescape_GFF3(key.strip(" "))
        is_list = key in list_types
        if is_list:
            val = [unescape_GFF3(X) for X in val.strip(" ").split(",")]
        else:
            val = unescape_GFF3(val.strip(" "))

        if key in d:
            warn("Found duplicate attribute key '%s' in GFF3 line. Catenating value with previous value for key in attr dict:\n    %s" % (key, inp),
                 FileFormatWarning)
            if is_list:
                # keep list-valued attributes as lists rather than
                # formatting two list reprs into a string
                val = d[key] + val
            else:
                val = "%s,%s" % (d[key], val)
        d[key] = val
    return d
Beispiel #5
0
    def _parse_fields(self):
        """Parse fields of an autoSql declaration, and populate
        ``self.field_formatters`` and ``self.field_comments``.

        ``self._field_text`` is cut into one chunk per entry of the comment
        locations returned by ``self.mask_comments``: each chunk runs from
        the end of the previous chunk through the second index of the entry
        (inclusive). Each chunk is offered to every class in ``match_order``;
        a class whose ``matches`` accepts it is instantiated on the chunk and
        stored under the field's name.

        If a name is already present in ``self.field_formatters``, the new
        field is renamed by appending the first integer suffix (starting at
        2) that yields an unused name, and a single |DataWarning| reporting
        the final name is issued.
        """
        # order in which we try to match autoSql fields
        match_order = [AutoSqlField, SizedAutoSqlField, ValuesAutoSqlField]

        # fields are area of string from last starting point to end of comment
        # first starting point is 0; all subsequent starting points will be end
        # of previous comment
        _, comment_locs = self.mask_comments(self._field_text)
        last_index = 0
        for (_, next_index) in comment_locs:
            field_str = self._field_text[last_index:next_index + 1]
            for field_class in match_order:
                if not field_class.matches(field_str):
                    continue

                my_parser = field_class(field_str)
                name = my_parser.attr["name"]
                if name in self.field_formatters:
                    oldname = name
                    current_formatter = self.field_formatters[oldname]
                    current_type = current_formatter.attr.get(
                        "type", current_formatter.__class__.__name__)
                    new_type = my_parser.attr.get(
                        "type", my_parser.__class__.__name__)

                    # find the first unused suffixed name ...
                    i = 1
                    while name in self.field_formatters:
                        i += 1
                        name = "%s%s" % (oldname, i)

                    # ... and only then warn, so the message never reports a
                    # candidate name that was itself taken and discarded
                    warn(
                        "Element named '%s' of type '%s' already found in autoSql declaration '%s.' Renaming current element of type '%s' to '%s'"
                        % (oldname, current_type,
                           self.attr.get("name", "unnamed declaration"),
                           new_type, name), DataWarning)
                    my_parser.attr["name"] = name

                self.field_formatters[name] = my_parser
                self.field_comments[name] = my_parser.attr["comment"]

            last_index = next_index + 1
Beispiel #6
0
 def __call__(self, text, rec=None):
     """Convert ``text`` with ``self.formatter``, falling back to the raw text

     Parameters
     ----------
     text : str
         Text to convert

     rec : object, optional
         Accepted but not used by this method. Default: None

     Returns
     -------
     object
         ``self.formatter(text)``; or ``text`` itself, unchanged, if the
         formatter raises :class:`ValueError` (a |DataWarning| is issued
         in that case)
     """
     try:
         value = self.formatter(text)
     except ValueError:
         # conversion failed: report it and hand the input back untouched
         details = (text, self.attr["name"], self.formatter.__name__)
         warn(
             "Could not convert autoSql value '%s' for field '%s' to type '%s'. Casting to 'string' instead. "
             % details, DataWarning)
         value = text
     return value
Beispiel #7
0
 def _assemble(self, line):
     """Convert one line of a `PSL`_ file into an object built by ``self.return_type``

     ``self.counter`` is incremented for every line seen. Blank lines, and
     lines starting with ``psLayout``, ``--`` or ``#``, or (after leading
     whitespace) with ``match``, are skipped; the result of
     ``self.__next__()`` is returned in their place.

     A line for which ``self.return_type.from_psl`` raises is appended to
     ``self.rejected``, a |FileFormatWarning| is issued, and the result of
     ``self.__next__()`` is returned.

     Parameters
     ----------
     line : str
         Line of text from a `PSL`_ file

     Returns
     -------
     object
         Whatever ``self.return_type.from_psl`` returns for the line, or the
         result of ``self.__next__()`` if the line was skipped or rejected
     """
     self.counter += 1
     if (line.strip() == ""
             or line.startswith(("psLayout", "--", "#"))
             or line.lstrip().startswith("match")):
         return self.__next__()

     try:
         return self.return_type.from_psl(line)
     except Exception as e:
         self.rejected.append(line)
         # Format the exception itself: `e.message` does not exist on
         # Python 3 exceptions, so using it would raise AttributeError here
         # instead of issuing the warning.
         warn("Rejecting line %s because of %s: %s" %
              (self.counter, e, line), FileFormatWarning)
         return self.__next__()
Beispiel #8
0
 def __call__(self, text, rec=None):
     """Apply ``self.formatter`` to ``text``; return ``text`` as-is on failure

     Parameters
     ----------
     text : str
         Text to convert

     rec : object, optional
         Accepted but not used by this method. Default: None

     Returns
     -------
     object
         Result of ``self.formatter(text)``. If the formatter raises
         :class:`ValueError`, a |DataWarning| is issued and the unmodified
         ``text`` is returned instead.
     """
     try:
         converted = self.formatter(text)
     except ValueError:
         template = "Could not convert autoSql value '%s' for field '%s' to type '%s'. Casting to 'string' instead. "
         field_name = self.attr["name"]
         type_name = self.formatter.__name__
         warn(template % (text, field_name, type_name), DataWarning)
         return text
     else:
         return converted
Beispiel #9
0
    def _parse_fields(self):
        """Parse fields of an autoSql declaration, and populate
        ``self.field_formatters`` and ``self.field_comments``.

        ``self._field_text`` is cut into one chunk per entry of the comment
        locations returned by ``self.mask_comments``: each chunk runs from
        the end of the previous chunk through the second index of the entry
        (inclusive). Each chunk is offered to every class in ``match_order``;
        a class whose ``matches`` accepts it is instantiated on the chunk and
        stored under the field's name.

        If a name is already present in ``self.field_formatters``, the new
        field is renamed by appending the first integer suffix (starting at
        2) that yields an unused name, and a single |DataWarning| reporting
        the final name is issued.
        """
        # order in which we try to match autoSql fields
        match_order = [AutoSqlField, SizedAutoSqlField, ValuesAutoSqlField]

        # fields are area of string from last starting point to end of comment
        # first starting point is 0; all subsequent starting points will be end
        # of previous comment
        _, comment_locs = self.mask_comments(self._field_text)
        last_index = 0
        for (_, next_index) in comment_locs:
            field_str = self._field_text[last_index:next_index + 1]
            for field_class in match_order:
                if field_class.matches(field_str):
                    my_parser = field_class(field_str)
                    name = my_parser.attr["name"]
                    if name in self.field_formatters:
                        oldname = name
                        i = 1
                        current_formatter = self.field_formatters[name]
                        current_type = current_formatter.attr.get("type", current_formatter.__class__.__name__)
                        new_type = my_parser.attr.get("type", my_parser.__class__.__name__)
                        while name in self.field_formatters:
                            i += 1
                            name = "%s%s" % (oldname, i)

                        # Warn once, after an unused name has been found, so
                        # the message never names a candidate that was
                        # already taken and therefore not used.
                        warn("Element named '%s' of type '%s' already found in autoSql declaration '%s.' Renaming current element of type '%s' to '%s'" % (oldname,
                                                                                                                                                           current_type,
                                                                                                                                                           self.attr.get("name", "unnamed declaration"),
                                                                                                                                                           new_type,
                                                                                                                                                           name),
                             DataWarning)
                        my_parser.attr["name"] = name

                    self.field_formatters[name] = my_parser
                    self.field_comments[name] = my_parser.attr["comment"]

            last_index = next_index + 1
Beispiel #10
0
    def __call__(self, text, rec=None):
        """Parse a value matching the field described by ``self.autosql``
        from a block of delimited text

        Unless ``self.formatter`` is ``str``, ``text`` is stripped of
        surrounding whitespace and delimiters, split on ``self.delim``, and
        each piece is converted with ``self.formatter``. The length of the
        result is then checked against the declared size of the field.

        Parameters
        ----------
        text : str
            Delimited text holding the values of the field

        rec : OrderedDict or None, optional
            Record whose attributes are being populated by recursive
            processing of ``text``. Passed in cases where fields sized by variables
            need to look up instance values of earlier fields to evaluate those
            variables.

        Returns
        -------
        tuple or str
            Tuple of converted values. ``text`` itself is returned if
            ``self.formatter`` is ``str`` (after the size check), or if
            conversion raises :class:`ValueError` (without a size check; a
            |DataWarning| is issued).

        Raises
        ------
        AssertionError
            If the number of parsed elements differs from the expected size
        """
        if self.formatter != str:
            try:
                retval = tuple([
                    self.formatter(X)
                    for X in text.strip().strip(self.delim).split(self.delim)
                ])
            except ValueError:
                message = "Could not convert autoSql value '%s' in field '%s' to tuple of type '%s'. Leaving as str " % (
                    text, self.attr["name"], self.formatter.__name__)
                warn(message, DataWarning)
                return text
        else:
            retval = text

        # A fixed size is stored directly in the field's attributes; a
        # variable size names an earlier field of `rec` that holds the count.
        if self.attr["size_is_int"] == True:
            expected_size = self.attr["size"]
        else:
            expected_size = rec[self.attr["size"]]

        # Explicit raise rather than `assert`, so the check is not stripped
        # when Python runs with -O. The exception type is kept for callers.
        if len(retval) != expected_size:
            raise AssertionError(
                "Field '%s' should contain %s elements, but %s were found." %
                (self.attr["name"], expected_size, len(retval)))

        return retval
Beispiel #11
0
    def _parse_track_line(self, inp):
        """Parse track line from `BED`_ / extended BED file

        ``self.metadata`` is replaced by a new dictionary holding the
        ``key=value`` pairs of the line. If the line declares a ``type`` that
        is a key of ``bed_x_formats`` and ``self.extra_columns`` is ``0``,
        ``self.extra_columns`` is set from ``bed_x_formats``. If extra columns
        were already set and differ from that definition, they are kept, a
        |FileFormatWarning| is issued, and ``self.metadata["type"]`` is set
        to ``"custom"``.

        Parameters
        ----------
        inp : str
            track definition line from `BED`_  / extended BED file, without
            the leading ``track`` keyword

        Returns
        -------
        dict
            key-value pairs from `BED`_ line (the same object as
            ``self.metadata``)

        Raises
        ------
        ValueError
            If a token of the line contains no ``=``
        """
        self.metadata = {}
        ltmp = shlex.split(inp.strip("\n"))
        for item in ltmp:
            # Split on the first '=' only: quoted values such as URLs with
            # query strings may themselves contain '='.
            k, v = item.split("=", 1)
            self.metadata[k] = v

        track_type = self.metadata.get("type", None)
        if track_type is not None:
            if track_type in bed_x_formats:
                self.printer.write(
                    "Found track type '%s' in track definition line. Assuming extra columns follow UCSC definitions."
                    % track_type
                )
                if self.extra_columns == 0:
                    self.extra_columns = bed_x_formats[track_type]
                elif self.extra_columns != bed_x_formats[track_type]:
                    my_columns = self._get_extra_column_names()
                    track_format_columns = ",".join([X[0] for X in bed_x_formats[track_type]])
                    warn(
                        "Extra columns specified by %s track type declaration (%s) don't match those specified by user (%s). Using those specified by user."
                        % (track_type, track_format_columns, my_columns),
                        FileFormatWarning,
                    )
                    self.metadata["type"] = "custom"
            else:
                self.printer.write("Found track type '%s' in track definition line." % track_type)

        # the docstring has always promised the parsed pairs; return them
        return self.metadata
Beispiel #12
0
 def _assemble(self, line):
     """Convert one line of a `PSL`_ file into an object built by ``self.return_type``

     ``self.counter`` is incremented for every line seen. Blank lines, and
     lines starting with ``psLayout``, ``--`` or ``#``, or (after leading
     whitespace) with ``match``, are skipped; the result of
     ``self.__next__()`` is returned in their place.

     A line for which ``self.return_type.from_psl`` raises is appended to
     ``self.rejected``, a |FileFormatWarning| is issued, and the result of
     ``self.__next__()`` is returned.

     Parameters
     ----------
     line : str
         Line of text from a `PSL`_ file

     Returns
     -------
     object
         Whatever ``self.return_type.from_psl`` returns for the line, or the
         result of ``self.__next__()`` if the line was skipped or rejected
     """
     self.counter += 1

     is_blank = line.strip() == ""
     is_header = (line.startswith(("psLayout", "--"))
                  or line.lstrip().startswith("match"))
     is_comment = line.startswith("#")
     if is_blank or is_header or is_comment:
         return self.__next__()

     try:
         return self.return_type.from_psl(line)
     except Exception as e:
         self.rejected.append(line)
         # `e.message` was removed in Python 3; referencing it here would
         # raise AttributeError from inside the handler. Formatting the
         # exception object with %s gives its text on every version.
         warn(
             "Rejecting line %s because of %s: %s" %
             (self.counter, e, line), FileFormatWarning)
         return self.__next__()
Beispiel #13
0
    def _parse_track_line(self, inp):
        """Parse track line from `BED`_ / extended BED file

        ``self.metadata`` is replaced by a new dictionary holding the
        ``key=value`` pairs of the line. If the line declares a ``type`` that
        is a key of ``bed_x_formats`` and ``self.extra_columns`` is ``0``,
        ``self.extra_columns`` is set from ``bed_x_formats``. If extra columns
        were already set and differ from that definition, they are kept, a
        |FileFormatWarning| is issued, and ``self.metadata["type"]`` is set
        to ``"custom"``.

        Parameters
        ----------
        inp : str
            track definition line from `BED`_  / extended BED file, without
            the leading ``track`` keyword

        Returns
        -------
        dict
            key-value pairs from `BED`_ line (the same object as
            ``self.metadata``)

        Raises
        ------
        ValueError
            If a token of the line contains no ``=``
        """
        self.metadata = {}
        for item in shlex.split(inp.strip("\n")):
            # maxsplit=1 keeps values that themselves contain '=' (for
            # example a quoted URL with a query string) in one piece
            k, v = item.split("=", 1)
            self.metadata[k] = v

        track_type = self.metadata.get("type", None)
        if track_type is not None:
            if track_type in bed_x_formats:
                self.printer.write(
                    "Found track type '%s' in track definition line. Assuming extra columns follow UCSC definitions."
                    % track_type)
                if self.extra_columns == 0:
                    self.extra_columns = bed_x_formats[track_type]
                elif self.extra_columns != bed_x_formats[track_type]:
                    my_columns = self._get_extra_column_names()
                    track_format_columns = ",".join(
                        [X[0] for X in bed_x_formats[track_type]])
                    warn("Extra columns specified by %s track type declaration (%s) don't match those specified by user (%s). Using those specified by user." %
                         (track_type, track_format_columns, my_columns), FileFormatWarning)
                    self.metadata["type"] = "custom"
            else:
                self.printer.write(
                    "Found track type '%s' in track definition line." %
                    track_type)

        # return the parsed pairs, as the docstring has always stated
        return self.metadata
Beispiel #14
0
    def __call__(self, text, rec=None):
        """Parse a value matching the field described by ``self.autosql``
        from a block of delimited text

        Unless ``self.formatter`` is ``str``, ``text`` is stripped of
        surrounding whitespace and delimiters, split on ``self.delim``, and
        each piece is converted with ``self.formatter``. The length of the
        result is then checked against the declared size of the field.

        Parameters
        ----------
        text : str
            Delimited text holding the values of the field

        rec : OrderedDict or None, optional
            Record whose attributes are being populated by recursive
            processing of ``text``. Passed in cases where fields sized by variables
            need to look up instance values of earlier fields to evaluate those
            variables.

        Returns
        -------
        tuple or str
            Tuple of converted values. ``text`` itself is returned if
            ``self.formatter`` is ``str`` (after the size check), or if
            conversion raises :class:`ValueError` (without a size check; a
            |DataWarning| is issued).

        Raises
        ------
        AssertionError
            If the number of parsed elements differs from the expected size
        """
        if self.formatter != str:
            try:
                retval = tuple([self.formatter(X) for X in text.strip().strip(self.delim).split(self.delim)])
            except ValueError:
                message = "Could not convert autoSql value '%s' in field '%s' to tuple of type '%s'. Leaving as str " % (text,
                                                                                                                         self.attr["name"],
                                                                                                                         self.formatter.__name__)
                warn(message, DataWarning)
                return text
        else:
            retval = text

        # fixed sizes live in the field attributes; variable sizes name an
        # earlier field of `rec` whose value is the element count
        if self.attr["size_is_int"] == True:
            expected_size = self.attr["size"]
        else:
            expected_size = rec[self.attr["size"]]

        # `assert` disappears under `python -O`; raise explicitly, keeping
        # the exception type that callers may already catch
        if len(retval) != expected_size:
            raise AssertionError(
                "Field '%s' should contain %s elements, but %s were found." %
                (self.attr["name"], expected_size, len(retval)))

        return retval
Beispiel #15
0
 def __init__(self, autosql, parent=None, delim=""):
     """Create an |AutoSqlField|

     After the base-class initializer has run, the formatter for the
     field's type (``self.attr["type"]``) is looked up first in
     ``self.field_types`` and then in ``self.parent.field_types``. If
     neither lookup succeeds, ``str`` is used as the formatter and a
     |DataWarning| is issued.

     Parameters
     ----------
     autosql : str
         Block of autoSql text specifying format of element

     parent : instance of subclass of |AbstractAutoSqlObject| or None, optional
         Parent / enclosing element. Default: None

     delim : str, optional
         Field delimiter, forwarded unchanged to
         ``AbstractAutoSqlElement.__init__``. The signature default is the
         empty string. NOTE(review): earlier documentation described the
         default as tab -- confirm how the base class treats ``""``.
     """
     AbstractAutoSqlElement.__init__(self, autosql, parent=parent, delim=delim)
     type_ = self.attr["type"]
     try:
         self.formatter = self.field_types[type_][0]
     except KeyError:
         try:
             self.formatter = self.parent.field_types[type_][0]
         except Exception:
             # Best-effort fallback is intentional (parent may be None or may
             # not know the type), but a bare `except` would also swallow
             # KeyboardInterrupt / SystemExit.
             self.formatter = str
             warn("Could not find formatter for field '%s' of type '%s'. Casting to 'string' instead." % (self.attr["name"], type_), DataWarning)
Beispiel #16
0
def parse_GTF2_tokens(inp):
    """Helper function to parse tokens in the final column of a `GTF2`_ file
    into a dictionary of attributes. All attributes are returned as strings,
    and both keys and values are passed through ``unescape_GTF2``.

    If duplicate keys are present (e.g. as in GENCODE `GTF2`_ files),
    a |FileFormatWarning| is issued and their values are catenated,
    separated by a comma.

    Examples
    --------
        >>> tokens = 'gene_id "mygene"; transcript_id "mytranscript";'
        >>> parse_GTF2_tokens(tokens)
        {'gene_id' : 'mygene', 'transcript_id' : 'mytranscript'}

        >>> tokens = 'gene_id "mygene"; transcript_id "mytranscript"'
        >>> parse_GTF2_tokens(tokens)
        {'gene_id' : 'mygene', 'transcript_id' : 'mytranscript'}

        >>> tokens = 'gene_id "mygene;"; transcript_id "myt;ranscript"'
        >>> parse_GTF2_tokens(tokens)
        {'gene_id' : 'mygene;', 'transcript_id' : 'myt;ranscript'}

        >>> tokens = 'gene_id "mygene"; transcript_id "mytranscript"; tag "tag value";'
        >>> parse_GTF2_tokens(tokens)
        {'gene_id' : 'mygene', 'tag' : 'tag value', 'transcript_id' : 'mytranscript'}

        >>> tokens = 'gene_id "mygene"; transcript_id "mytranscript"; tag "tag value"; tag "tag value 2";'
        >>> parse_GTF2_tokens(tokens)
        {'gene_id' : 'mygene', 'tag' : 'tag value,tag value 2', 'transcript_id' : 'mytranscript'}

    Parameters
    ----------
    inp : str
        Ninth column of `GTF2`_ entry

    Returns
    -------
    dict : key-value pairs

    Raises
    ------
    AssertionError
        If the tokens do not pair up into keys and values, or if any value
        other than the last is not terminated by a semicolon
    """
    d = {}
    items = shlex.split(inp.strip("\n"))

    # Explicit raises instead of `assert`: assertions are stripped under
    # `python -O`, which would let malformed lines through (or fail later
    # with an unrelated IndexError). The exception type is kept for callers.
    if len(items) % 2 != 0:
        raise AssertionError(
            "Expected an even number of key/value tokens in GTF2 attributes:\n    %s" % inp)

    for i in range(0, len(items), 2):
        key = unescape_GTF2(items[i])
        val = items[i + 1]
        # require separation by semicolons for all but final token
        if i + 1 < len(items) - 2 and not val.endswith(";"):
            raise AssertionError(
                "Expected ';' after value of attribute '%s' in GTF2 attributes:\n    %s" % (key, inp))

        # shlex has already removed the quotes; drop the trailing separator
        if val.endswith(";"):
            val = val[:-1]

        if key in d:
            warn("Found duplicate attribute key '%s' in GTF2 line. Catenating value with previous value for key in attr dict:\n    %s" % (key, inp),
                 FileFormatWarning)
            d[key] = "%s,%s" % (d[key], unescape_GTF2(val))
        else:
            d[key] = unescape_GTF2(val)

    return d
Beispiel #17
0
def main(argv=sys.argv[1:]):
    """Command-line program

    Parses the arguments, obtains transcripts via
    ``AnnotationParser.get_transcripts_from_args``, and writes each one to
    ``outfile`` in `GTF2`_ or `BED`_ format. For `BED`_ output, a closing
    message built from ``MAKE_BIGBED_MESSAGE`` is printed at the end.

    Parameters
    ----------
    argv : list, optional
        A list of command-line arguments, which will be processed
        as if the script were called from the command line if
        :py:func:`main` is called directly.

        Default: sys.argv[1:] (actually command-line arguments)
    """
    ap = AnnotationParser()
    bp = BaseParser()
    annotation_parser = ap.get_parser()
    base_parser = bp.get_parser()

    parser = argparse.ArgumentParser(description=format_module_docstring(__doc__),
                                     parents=[base_parser, annotation_parser],
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    # `store_false`: passing --no_escape turns escaping OFF; the value is
    # forwarded below as `escape=` to `as_gtf`
    parser.add_argument("--no_escape", default=True, action="store_false",
                        help="If specified and output format is GTF2, special characters in column 9 will NOT be escaped (default: escape them)")
    parser.add_argument("--output_format", choices=["BED", "GTF2"], default="GTF2",
                        help="Format of output file. (default: GTF2)")
    parser.add_argument("--extra_columns", nargs="+", default=[], type=str,
                        help="Attributes (e.g. 'gene_id' to output as extra columns in extended BED format (BED output only).")
    parser.add_argument("--empty_value", default="na", type=str,
                        help="Value to use of an attribute in `extra_columns` is not defined for a particular record (Default: 'na'")
    parser.add_argument("outfile", metavar="outfile.[ bed | gtf ]", type=str,
                        help="Output file")
    args = parser.parse_args(argv)
    bp.get_base_ops_from_args(args)

    end_message = ""
    extra_cols = args.extra_columns
    if extra_cols is not None and args.output_format == "BED":

        # avoid name clashes
        names_used = copy.copy(BED12_RESERVED_NAMES)
        asql_names = [fix_name(X, names_used) for X in extra_cols]
        autosql_str = "\n".join(AUTOSQL_ROW_FMT_STR % (X, " " * max(15 - len(X), 2)) for X in asql_names)

        # NOTE(review): `args.outfile[:-4]` assumes a four-character
        # extension such as ".bed" -- confirm for other file names
        file_info = {
            "outbase": args.outfile.replace(".bed", "").replace(".gtf", ""),
            "numcols": len(extra_cols),
            "autosql": DEFAULT_AUTOSQL_STR % (os.path.basename(args.outfile[:-4]), autosql_str),
        }
        end_message = MAKE_BIGBED_MESSAGE % file_info
    elif extra_cols:
        # Warn only when the user actually supplied extra columns. The
        # option defaults to an empty list, so testing `is not None` alone
        # would warn on every non-BED run.
        warn("`--extra_columns` is ignored for %s-formatted output." % (args.output_format), ArgumentWarning)

    with argsopener(args.outfile, args, "w") as fout:
        c = 0
        transcripts = ap.get_transcripts_from_args(args, printer=printer)

        for transcript in transcripts:
            if args.output_format == "GTF2":
                fout.write(transcript.as_gtf(escape=args.no_escape))
            elif args.output_format == "BED":
                fout.write(transcript.as_bed(extra_columns=extra_cols, empty_value=args.empty_value))
            if c % 1000 == 1:
                printer.write("Processed %s transcripts ..." % c)
            c += 1

    printer.write("Processed %s transcripts total." % c)
    printer.write("Done.")
    print(end_message)