# BASE COUNT line: the literal "BASE COUNT", blank space, the base_count
# expression (defined elsewhere in this module), then an end-of-line.
base_count_line = Martel.Group(
    "base_count_line",
    Martel.Str("BASE COUNT") +
    blank_space +
    base_count +
    Martel.AnyEol())

# ORIGIN
#       1 ggacaaggcc aaggatgctg ctgctgcagc tggagcttcc gcgcaacaag taaacagata
#
# The ORIGIN keyword is followed either by text up to the end of the line
# (captured as "origin_name") or by a bare end-of-line.
origin_line = Martel.Group(
    "origin_line",
    Martel.Str("ORIGIN") +
    (Martel.ToEol("origin_name") | Martel.AnyEol()))

# Regex literals are raw strings so that "\d" and "\w" reach the pattern
# parser untouched and do not trigger Python's invalid-escape warnings.
# The resulting string values are identical to the non-raw spellings.
base_number = Martel.Group(
    "base_number",
    Martel.Re(r"[\d]+"))

# One run of word characters, grouped as "sequence" and wrapped by
# Std.sequence.
sequence = Std.sequence(
    Martel.Group(
        "sequence",
        Martel.Re(r"[\w]+")))

# One or more (single space + optional sequence run), optionally followed
# by one more trailing space.
sequence_plus_spaces = Martel.Group(
    "sequence_plus_spaces",
    Martel.Rep1(Martel.Str(" ") + Martel.Opt(sequence)) +
    Martel.Opt(Martel.Str(" ")))

# A sequence data line: blank space, an optional base number, the
# space-separated sequence runs, then an end-of-line.
sequence_line = Martel.Group(
    "sequence_line",
    blank_space +
    Martel.Opt(base_number) +
    sequence_plus_spaces +
    Martel.AnyEol())

# The full sequence section: the ORIGIN line followed by at least one
# sequence data line, grouped as "sequence_entry" and wrapped by
# Std.sequence_block.
sequence_entry = Std.sequence_block(
    Martel.Group(
        "sequence_entry",
        origin_line + Martel.Rep1(sequence_line)))

# CONTIG
base_count_line = Martel.Group("base_count_line", Martel.Str("BASE COUNT") + blank_space + base_count + Martel.AnyEol()) # ORIGIN # 1 ggacaaggcc aaggatgctg ctgctgcagc tggagcttcc gcgcaacaag taaacagata origin_line = Martel.Group("origin_line", Martel.Str("ORIGIN") + (Martel.ToEol("origin_name") | Martel.AnyEol())) base_number = Martel.Group("base_number", Martel.Re("[\d]+")) sequence = Std.sequence(Martel.Re(r"[\w ]+")) sequence_line = Martel.Group("sequence_line", blank_space + Martel.Opt(base_number) + sequence + Martel.AnyEol()) sequence_entry = Std.sequence_block(origin_line + Martel.Rep1(sequence_line)) # CONTIG # this is the contig information for RefSeq records contig_location = Martel.Group("contig_location", Martel.ToEol("feature_location") + \
#--- SQ # SQ SEQUENCE XXXX AA; XXXXX MW; XXXXX CRC32; # (Those X's don't really indicate the size) SQ = Martel.Group("SQ", Martel.Re("SQ SEQUENCE +(?P<sequence_length>\d+) AA;" \ " +(?P<molecular_weight>\d+) MW;" \ " +(?P<crc?type=32>\w+) CRC32;\R") ) ##SQ_data = Martel.Group("SQ_data", ## Martel.Re(" (?P<sequence>[^\R]*)\R")) SQ_data = Martel.Str(" ") + \ Std.sequence(Martel.UntilEol()) + \ Martel.AnyEol() ##sequence = Martel.Group("sequence_block", Martel.Group("SQ_data_block", ## SQ + Martel.Rep(SQ_data))) sequence = Std.sequence_block(SQ + Martel.Rep(SQ_data), {"alphabet": "iupac-ambiguous-protein"}) #--- // end = Martel.Group("END", Martel.Str("//") + Martel.AnyEol()) ####################### put it all together record = Std.record(
Martel.Str(" BP; ") + \
Martel.Digits("num_A") + \
Martel.Str(" A; ") + \
Martel.Digits("num_C") + \
Martel.Str(" C; ") + \
Martel.Digits("num_G") + \
Martel.Str(" G; ") + \
Martel.Digits("num_T") + \
Martel.Str(" T; ") + \
Martel.Digits("num_other") + \
Martel.Str(" other;") + \
Martel.AnyEol()
# The expression above is the visible tail of a definition that starts
# before this excerpt (presumably SQ_line, which SQ_block uses below --
# confirm).  It matches the per-base counts, tagging each number as
# num_A / num_C / num_G / num_T / num_other.

## bb - (blanks) sequence data (>=1 per entry)
# One data line: a leading blank prefix, exactly 65 arbitrary characters
# wrapped by Std.sequence, whitespace, the digits tagged "end_position",
# then the end-of-line.
# NOTE(review): confirm the blank-run widths inside the literals in this
# excerpt against the EMBL flat-file line layout.
SQ_data = Martel.Str(" ") + \
          Std.sequence(Martel.Re(".{65}")) + \
          whitespace + \
          Martel.Digits("end_position") + \
          Martel.AnyEol()

# The SQ header line followed by one or more data lines, wrapped by
# Std.sequence_block.
SQ_block = Std.sequence_block(SQ_line + Martel.Rep1(SQ_data))

## // - termination line (ends each entry; 1 per entry)
end = Martel.Str("//") + Martel.AnyEol()

# Whole-record expression: the line/block expressions concatenated in
# order, with optional XX separator lines between them.
# NOTE(review): this definition is cut off in this excerpt; it is kept
# exactly as found and continues on lines not visible here.
record = Martel.Group("record", \
    ID_line + \
    Martel.Opt(XX) + \
    AC_block + \
    Martel.Opt(XX) + \
    SV_line + \
# Alternation ("|") over every expression in ``ids`` (built earlier in this
# module), wrapped by Std.dbxref.
ncbi_word = Std.dbxref(
    reduce(operator.or_, ids)
)

# An NCBI-style term: one word followed by any number of "|"-separated words.
#ncbi_term = Assert(Re("[^ \R]+\|")) + \
ncbi_term = ncbi_word + Rep(Str("|") + ncbi_word)

# Fallback for anything else: text up to the first space, recorded as a
# dbxref id with dbname "local".
generic_term = Std.dbxref(
    Std.dbxref_dbid(
        UntilSep(sep=" "),
        {"dbname": "local"},
    )
)

# The NCBI form is tried first, then the generic form.
id_term = ncbi_term | generic_term

###########################################################

# Zero or more "#" comment lines.
comment_lines = Rep(Str("#") + ToEol())

# Title line: ">" followed by the id term and the rest of the line, both
# inside Std.description_line, then the end-of-line.
title = (
    Str(">")
    + Std.description_line(id_term + UntilEol())
    + AnyEol()
)

# A sequence line is any line that does not start with ">".
seqline = (
    AssertNot(Str(">"))
    + Std.sequence(UntilEol())
    + AnyEol()
)

# can get a sequence line without an Eol at the end of a file
seqline_nonewline = (
    AssertNot(Str(">"))
    + Std.sequence(Word())
)

# Zero or more sequence lines of either kind, wrapped by Std.sequence_block.
sequence = Std.sequence_block(
    Rep(seqline | seqline_nonewline)
)

# One record: optional comments, the title, the sequence block, then any
# number of trailing end-of-lines.
record = Std.record(
    comment_lines
    + title
    + sequence
    + Rep(AnyEol())
)

# define a format which reads records, but allows #-style comments in
# the FASTA file
format = HeaderFooter(
    "dataset", {"format": "fasta"},
    comment_lines, RecordReader.Until, (">",),
    record, RecordReader.StartsWith, (">",),
    comment_lines, RecordReader.Everything, (),
)