def writeTarquin(self, outpath):
        """Write the processed spectrum into the DICOM dataset and save it
        as a Tarquin-readable file under <outpath>/Tarquin_files.

        :param outpath: directory in which the 'Tarquin_files' subfolder
            is created (if missing) to hold the output file
        """
        outpath = Path(outpath)
        Tarquindir = outpath / 'Tarquin_files'
        # Create the folder without changing the process cwd
        # (same idiom as fitTarquin's Tarquinfitdir.mkdir).
        Tarquindir.mkdir(parents=True, exist_ok=True)

        # Output name: source file's basename with '.' characters removed
        name = self.filename[(self.filename.rfind('\\') + 1):].translate(
            str.maketrans('', '', r'.'))
        file_path = Path(Tarquindir, name + 'proc_Tarquin')
        print(file_path)

        # NOTE: Spec_temp aliases self.SpecData, so the loop fills it in place
        Spec_temp = self.SpecData
        counter = 0

        # Need complex conj for proper display, hence -imag.
        # Only the first half of the frames holds water-suppressed data
        # (assumes self.Frames is an even integer — TODO confirm).
        for b in range(0, self.Frames // 2):
            for a in range(0, self.Datapoints):
                Spec_temp[counter] = self.Kspacewrite[b][a].real
                counter += 1
                Spec_temp[counter] = -self.Kspacewrite[b][a].imag
                counter += 1

        self.ds[0x5600, 0x0020].value = Spec_temp
        self.ds.save_as(str(file_path.resolve()))
Exemple #2
0
    def performOpen(self, options={}):
        """Open the instrument connection and detect installed options."""
        # generic VISA open makes sure we actually have a connection
        VISA_Driver.performOpen(self, options=options)

        # Detect options: (vector) magnet axes and switch heater
        strip_tbl = str.maketrans(dict.fromkeys(string.ascii_letters + '/'))
        reply = self.askAndLog('READ:SYS:VRM:RFMX').strip()
        rates = reply.rsplit(':', 1)[1][1:-1].translate(strip_tbl).split()

        detected = []
        axis_labels = ("x magnet", "y magnet", "z magnet")
        for idx in range(3):
            # a non-zero ramp rate means the corresponding magnet is present
            if float(rates[idx]) > 0:
                detected.append(axis_labels[idx])

        heater_reply = self.askAndLog('READ:SYS:VRM:SWHT').strip()
        heaters = heater_reply.rsplit(':', 1)[1][1:-1].split()
        if any(heaters[idx] != "NOSW" for idx in range(3)):
            detected.append("switch heater")

        self.instrCfg.setInstalledOptions(detected)

        # Make sure that the coordinate system matches the device
        coordFunc = self.instrCfg.getQuantity('CoordSys')
        coordFunc.setValue(self.performGetValue(coordFunc))
Exemple #3
0
def translator(*args):
    """Build and return a function applying str.maketrans(*args) to text."""
    table = str.maketrans(*args)

    def apply(text):
        return text.translate(table)

    return apply
Exemple #4
0
def translate_nt_to_RY(seq):
    """Translates nucleotides to RY (A,G -> R; C,U,T -> Y).

    >>> translate_nt_to_RY("ACGUTACGUT")
    'RYRYYRYRYY'
    """
    # NOTE: the doctest expected value must be quoted (repr of a str),
    # otherwise it fails under the doctest runner.
    trans_table = str.maketrans("AGCUT", "RRYYY")
    trans_seq = seq.translate(trans_table)
    logging.debug(seq + " -> " + trans_seq)
    return trans_seq
Exemple #5
0
def factory(name, value, paramTypes=None, **kwargs):
    """
    Generates a new Parameter type derived from one of the predefined
    base classes choosen by the supplied value: Providing a string value
    results in a type derived from ParameterBase, providing an integer
    value produces a ParameterNumerical type and a float value results
    in a ParameterFloat type.
    Alternatively, a class type cls can be provided which is used as base
    class for the resulting Parameter class type. Make sure in this case,
    all attributes mandatory for this base type are provided too.

    - *name*: short name of the new parameter without spaces
    - *value*: default value from which the type is derived if cls is not given

    Optional arguments:

    - *paramTypes*:  tuple of available parameter types instead of the default
    - *cls*:         forces a certain Parameter type.
    - *description*: Updates the __doc__ attribute. May be displayed in the UI
                     somewhere.
    """
    # fold positional args into kwargs so they forward to setAttributes below
    kwargs.update(name=name, value=value)
    name = kwargs.get("name", None)
    assertName(name, ParameterNameError)
    value = kwargs.get("value", None)
    cls = kwargs.pop("cls", None)  # remove 'cls' keyword before forwarding
    if paramTypes is None:
        paramTypes = (ParameterBoolean, ParameterFloat, ParameterNumerical,
                      ParameterBase)
    # Use the provided cls only if it is a ParameterBase subclass (or a
    # super() proxy wrapping one); otherwise pick the first type in
    # paramTypes whose isDataType() accepts the value.
    if not (cls is not None and
            ((isinstance(cls, super)
              and issubclass(cls.__thisclass__, ParameterBase))
             or issubclass(cls, ParameterBase))):
        for cls in paramTypes[:-1]:
            if cls.isDataType(value):
                break
        else:
            # no specific type matched the value
            cls = paramTypes[-1]  # ParameterBase usually
    # embed description as class documentation
    clsdict = dict()
    description = kwargs.get("description", None)
    if isString(description) and len(description) > 0:
        clsdict['__doc__'] = description
    # create a new class/type with given name and base class
    # translate works different for unicode strings:
    typeName = (str(name.title()).translate(str.maketrans("", "", ' \t\n\r')) +
                "Parameter")
    NewType = None
    try:
        NewType = type(typeName, (cls, ), clsdict)
    except TypeError:  # Python 2: type() argument 1 must be string, not unicode
        NewType = type(typeName.encode('ascii', 'ignore'), (cls, ), clsdict)
    # set up the new class before return
    return NewType.setAttributes(**kwargs)
Exemple #6
0
    def _parse_words(self,
                     text,
                     filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                     lower=True,
                     split=' '):
        if lower:
            text = text.lower()

        translate_map = str.maketrans(filters, split * len(filters))

        text = text.translate(translate_map)
        seq = text.split(split)
        return [i for i in seq if i]
Exemple #7
0
    def _save_url(self, url_data, content, url_text, url_pos):
        """Saves url. Converts url to 1-line text and url position as offset from the file beginning to (line, column).

        :param url_data: object for url storing
        :param content: file content
        :param url_text: url text
        :param url_pos: url position from the beginning
        """
        # 1-based line number: count newlines before the url position
        line = content.count('\n', 0, url_pos) + 1
        # column = offset past the preceding newline; rfind returns -1 on the
        # first line, which still yields a correct 1-based column
        column = url_pos - content.rfind('\n', 0, url_pos)
        # NOTE(review): `str_text` is defined elsewhere in this module —
        # presumably an alias of `str` — confirm before refactoring.
        url_data.add_url(url_text.translate(str_text.maketrans("", "", '\n ')),
                         line=line,
                         column=column)
def translateFastaAlphabet(source, mapping, outname="/tmp/test.fna"):
    """Translate each record's sequence through a character mapping and
    write the results to a FASTA file.

    :param source: iterable of SeqRecord-like objects (.seq, .id, .name,
        .description are read)
    :param mapping: dict mapping single characters to single characters
    :param outname: output FASTA path; defaulted to the previously
        hard-coded value for backward compatibility
    :return: the path of the written FASTA file
    """
    # create translation table to be used by str.translate()
    s_from = ''.join(mapping.keys())
    s_to = ''.join(mapping.values())
    assert (len(s_from) == len(s_to))
    trans = str.maketrans(s_from, s_to)

    out = []
    for orig in source:
        translatedSeq = str(orig.seq).translate(trans)
        out.append(
            SeqRecord(Seq(translatedSeq),
                      id=orig.id,
                      name=orig.name,
                      description=orig.description))

    SeqIO.write(out, outname, "fasta")
    return outname
Exemple #9
0
class keep_chars(object):
    """Returns a filter object o(s): call to return a filtered string.

    Specifically, strips out everything in s that is not in keep.
    This filter is case sensitive by default.
    """
    # All 8-bit characters, as Python 2's string.maketrans('', '') produced.
    # Under Python 3, str.maketrans('', '') returns an empty dict, which made
    # the original delchars computation always empty and the two-argument
    # s.translate(a, d) call in __call__ a TypeError.
    allchars = ''.join(map(chr, range(256)))

    def __init__(self, keep, case_sens=True):
        """Returns a new keep_chars object, based on string keep"""
        if not case_sens:
            low = keep.lower()
            up = keep.upper()
            keep = low + up
        self.delchars = ''.join([c for c in self.allchars if c not in keep])
        # precomputed deletion table for Python 3's one-argument translate
        self._table = str.maketrans('', '', self.delchars)

    def __call__(self, s, a=None, d=None):
        """f(s) -> s with every character not in keep removed.

        a and d are retained for backward compatibility with the old
        (allchars, delchars) two-argument translate signature; a is
        ignored, d overrides the precomputed deletion set.
        NOTE: characters above chr(255) are never deleted, matching the
        original byte-oriented semantics.
        """
        if d is None:
            return s.translate(self._table)
        return s.translate(str.maketrans('', '', d))
Exemple #10
0
 def setTargetField(self, axis, value, sCmd):
     """Overwrite one component of the target field vector and send sCmd."""
     # Vector results depend on the coordinate system
     coordFunc = self.instrCfg.getQuantity('CoordSys')
     if self.Bchanged == False:
         # hold the magnet and sync the coordinate system first
         self.askAndLog('SET:SYS:VRM:ACTN:HOLD')
         self.waitForIdle()
         self.performSetValue(coordFunc, coordFunc.getValue())
         self.performGetValue(coordFunc)
     # parse the current set-point vector, stripping unit letters
     reply = self.askAndLog("READ:SYS:VRM:VSET").strip()
     strip_tbl = str.maketrans(dict.fromkeys(string.ascii_letters))
     comp_a, comp_b, comp_c = reply.rsplit(
         ':', 1)[1][1:-1].translate(strip_tbl).split()
     coord = coordFunc.getValue()
     # in non-Cartesian systems, start from the configured quantities
     if coord == 'Cylindrical':
         comp_a = self.instrCfg.getQuantity('Brho').getValue()
         comp_b = self.instrCfg.getQuantity('Btheta').getValue()
         comp_c = self.instrCfg.getQuantity('Bz').getValue()
     elif coord == 'Spherical':
         comp_a = self.instrCfg.getQuantity('Br').getValue()
         comp_b = self.instrCfg.getQuantity('Btheta').getValue()
         comp_c = self.instrCfg.getQuantity('Bphi').getValue()
     # which component the requested axis maps to, per coordinate system
     slot_map = {
         'Cartesian': {'Bx': 0, 'By': 1, 'Bz': 2},
         'Cylindrical': {'Brho': 0, 'Btheta': 1, 'Bz': 2},
         'Spherical': {'Br': 0, 'Btheta': 1, 'Bphi': 2},
     }.get(coord, {})
     vec = [comp_a, comp_b, comp_c]
     if axis in slot_map:
         vec[slot_map[axis]] = value
     sMsg = sCmd.replace('<*>', str(vec[0]) + " " + str(vec[1]) + " " +
                         str(vec[2]))
     self.askAndLog(sMsg)
    def writelogfile(self, outpath, version):
        """Write a text log of the Tarquin pre-processing to
        <outpath>/Log_files/<name>log_file.txt.

        :param outpath: directory in which the 'Log_files' subfolder is
            created (if missing)
        :param version: version string written into the log header
        """
        outpath = Path(outpath)
        Logdir = outpath / 'Log_files'
        # create the folder without the chdir side effect
        Logdir.mkdir(parents=True, exist_ok=True)

        if self.Frames == 1:
            frames = 1
        else:
            # self.Frames / 2 because NWS data also stored in Dicom file
            # (assumes self.Frames is an even integer — TODO confirm)
            frames = self.Frames // 2

        # log name: source file's basename with '.' characters removed
        name = self.filename[(self.filename.rfind('\\') + 1):].translate(
            str.maketrans('', '', r'.'))
        file_path = Path(Logdir, name + 'log_file.txt')

        # keep the self.text_file attribute for backward compatibility, but
        # guarantee the handle is closed even if a write fails
        with open(str(file_path.resolve()), 'w') as log:
            self.text_file = log

            # Write Log File
            log.write('Tarquin Pre-processing Log file\n\n')
            print('Filename: %s\n' % (file_path), file=log)
            print('Version: %s\n' % (version), file=log)

            for cnt in range(0, frames):
                print('Frame: %i' % (cnt), file=log)
                print('Include: %i' % (self.IncludeFrame[cnt]), file=log)
                print('Phasing: %i' % (self.optphasearr[cnt]), file=log)
                log.write('Peak positions: ' + str(self.peakposarr[cnt]) +
                          '\n\n')

        print('Log file written')
Exemple #12
0
def text_to_word_sequence(txt,
                          filters=string.punctuation + '\n\t',
                          lower=True,
                          rmSingleChar=True,
                          split=" ",
                          maxLength=None):
    """Converts a text to a sequence of words (or tokens).

    Args:
        txt (str): Input text (string).
        filters (str,optional): Sequence of characters to filter out.
        lower (bool,optional): Whether to convert the input to lowercase.
        rmSingleChar (bool,optional): Whether to remove words with a single letter.
        split (str,optional): Sentence split marker (string).
        maxLength (int,optional): max length of a text. Drops the rest.

    Yields:
        str: the next word (or token).
    """
    maxLen = float("inf") if maxLength is None else maxLength

    if lower:
        txt = txt.lower()

    # Map every filtered character to the split marker.
    # (The former Python 2 branch referenced the undefined names `text`
    # and `unicode`; it was dead code on Python 3 and has been removed.)
    translate_map = str.maketrans(filters, split * len(filters))
    txt = txt.translate(translate_map)

    for i, el in enumerate(txt.split(split)):
        if rmSingleChar and len(el) == 1:
            continue
        # i counts raw split slots (including empty ones), not yielded words
        if i >= maxLen:
            break
        if el:
            yield el
Exemple #13
0
def escape(str):
    """Double every backslash in a string; recurses element-wise into lists.

    (The parameter shadows the builtin `str`; the name is kept because it
    is visible to callers.)
    """
    if isinstance(str, list):
        return [escape(item) for item in str]
    return str.translate(str.maketrans({"\\": r"\\"}))
Exemple #14
0
 def performGetValue(self, quant, options={}):
     """Perform the Get Value instrument operation.

     Dispatches on quant.name: temperatures, control-loop state, setpoint,
     heater range, persistent-on-completion, coordinate system, field
     components, or a generic VISA query.
     """
     # on first call clear B-result buffer
     if self.isFirstCall(options):
         self.Bresult = []
     # check type of quantity
     if quant.name in ('T1', 'T2', 'T3', 'T4', 'T5', 'T6', 'T7', 'T8', 'T9',
                       'T10', 'T11', 'T12', 'T13'):
         # temperatures, get value strings
         sAns = self.askAndLog(quant.get_cmd).strip()
         # convert string to float by taking everything after last colon,
         # ignoring final 'K'
         value = float(sAns.rsplit(':', 1)[1][:-1])
     elif quant.name == 'ControlLoop':
         # NOTE: was `quant.name in ('ControlLoop')` — ('X') is a plain
         # string, so that performed a substring test, not tuple membership.
         # Probe channels 1-13 until one reports a value; `value` stays
         # unbound if none is found (pre-existing behavior).
         for i in range(1, 14):
             sAns = self.askAndLog(quant.get_cmd.replace('<c>',
                                                         str(i))).strip()
             sAns = sAns.rsplit(':', 1)[1]
             if sAns != "NOT_FOUND":
                 value = (sAns == "ON")
                 break
     elif quant.name == 'TSet':
         for i in range(1, 14):
             sAns = self.askAndLog(quant.get_cmd.replace('<c>',
                                                         str(i))).strip()
             sAns = sAns.rsplit(':', 1)[1]
             if sAns != "NOT_FOUND":
                 # strip the trailing 'K' unit
                 value = float(sAns[:-1])
                 break
     elif quant.name == 'HeaterRange':
         for i in range(1, 14):
             sAns = self.askAndLog(quant.get_cmd.replace('<c>',
                                                         str(i))).strip()
             sAns = sAns.rsplit(':', 1)[1]
             if sAns != "NOT_FOUND":
                 # strip the trailing two-character unit
                 value = quant.getValueFromCmdString(sAns[:-2])
                 break
     elif quant.name == 'PoC':
         sAns = self.askAndLog(quant.get_cmd).strip()
         value = (sAns.rsplit(':', 1)[1] == "ON")
     elif quant.name == 'CoordSys':
         sAns = self.askAndLog(quant.get_cmd).strip()
         value = quant.getValueFromCmdString(sAns.rsplit(':', 1)[1])
     elif quant.name in ('Bx', 'By', 'Bz', 'Br', 'Brho', 'Bphi', 'Btheta'):
         coordFunc = self.instrCfg.getQuantity('CoordSys')
         if not self.Bresult:
             # fetch all three components once, strip unit letters
             vectValue = self.askAndLog(quant.get_cmd).strip()
             table = str.maketrans(dict.fromkeys(string.ascii_letters))
             self.Bresult = vectValue.rsplit(
                 ':', 1)[1][1:-1].translate(table).split()
         # Vector results depend on the coordinate system
         value = float('nan')
         if coordFunc.getValue() == 'Cartesian':
             if quant.name == 'Bx':
                 return float(self.Bresult[0])
             elif quant.name == 'By':
                 return float(self.Bresult[1])
             elif quant.name == 'Bz':
                 return float(self.Bresult[2])
         elif coordFunc.getValue() == 'Cylindrical':
             if quant.name == 'Brho':
                 return float(self.Bresult[0])
             elif quant.name == 'Btheta':
                 return float(self.Bresult[1])
             elif quant.name == 'Bz':
                 return float(self.Bresult[2])
         elif coordFunc.getValue() == 'Spherical':
             if quant.name == 'Br':
                 return float(self.Bresult[0])
             elif quant.name == 'Btheta':
                 return float(self.Bresult[1])
             elif quant.name == 'Bphi':
                 return float(self.Bresult[2])
     else:
         # for all other cases, call VISA driver
         cmd = quant.get_cmd
         if (cmd is not None) and (cmd != ''):
             value = self.askAndLog(cmd).strip().rsplit(':', 1)[1]
         else:
             value = quant.getValue()
     return value
Exemple #15
0
 def setName(cls, name):
     """Changing the name is allowed for the class/type only,
     not for instances."""
     assertName(name, ParameterNameError)
     # drop whitespace characters so the stored name is a safe identifier
     whitespace_tbl = str.maketrans("", "", ' \t\n\r')
     cls._name = str(name).translate(whitespace_tbl)
Exemple #16
0
import dedupe.levenshtein as levenshtein

# Tokenizer helpers: bound methods of precompiled patterns, so each call
# is a single C-level findall/match.
words = re.compile(r"[\w']+").findall
integers = re.compile(r"\d+").findall
start_word = re.compile(r"^([\w']+)").match
start_integer = re.compile(r"^(\d+)").match
# tokens that mix letters and at least one digit
alpha_numeric = re.compile(r"(?=.*\d)[a-zA-Z\d]+").findall

# Version-portable punctuation stripper.
# NOTE: was `sys.version < '3'` — a lexicographic compare on the version
# *string*, which is fragile; use the version_info tuple instead
# (consistent with the sys.version_info check used elsewhere in this file).
if sys.version_info < (3,):
    PUNCTUATION = string.punctuation

    def strip_punc(s):
        # Python 2 bytes.translate(table=None, deletechars=...) semantics
        s = s.encode('utf-8').translate(None, PUNCTUATION)
        return s.decode('utf-8')
else:
    # one reusable deletion table for str.translate
    PUNCTABLE = str.maketrans("", "", string.punctuation)

    def strip_punc(s):
        """Return s with all ASCII punctuation removed."""
        return s.translate(PUNCTABLE)


class Predicate(object):
    def __iter__(self):
        yield self

    def __repr__(self):
        return "%s: %s" % (self.type, self.__name__)

    def __hash__(self):
        try:
            return self._cached_hash
#


from __future__ import division
from __future__ import print_function
from builtins import str
from builtins import range
from builtins import object
from past.utils import old_div
import sys
import string
import random
import os
import re
import pysam
# DNA complement table (case-preserving: A<->T, C<->G)
tt = str.maketrans("ACTGactg","TGACtgac")

class fastaWriter(object):
    def __init__(self,filename,linelen=60):
#        print(filename, file=sys.stderr)
        self.f=open(filename,"w")
#        print(self.f, file=sys.stderr)
        self.linelen=linelen
        self.buff=""
        self.x=0
        self.name=""

    def write(self,s):
        self.buff+=s
        self.x+=len(s)
#        print(len(self.buff),self.linelen, file=sys.stderr)
else:
    timerForPreFolding  = DummyTimer()
    timerForFolding     = DummyTimer()
    timerForPostFolding = DummyTimer()
    

def parseOption(possibleValues, name):
    """Return an argparse type-checker accepting only possibleValues.

    The returned callable passes valid values through unchanged and raises
    argparse.ArgumentTypeError (naming `name`) for anything else.
    """
    def checkOption(value):
        if value not in possibleValues:
            raise argparse.ArgumentTypeError(
                "Unknown %s '%s', allowed values: %s"
                % (name, value, ",".join(possibleValues)))
        return value

    return checkOption


# Marker tables: map each nucleotide class to '&' so a single count('&')
# tallies the class after translation.
translateAllDeterminedNucleotides = str.maketrans("acgtACGT", "&&&&&&&&")
translateGCNucleotides            = str.maketrans("gcGC",     "&&&&")
translatePurineNucleotides        = str.maketrans("agAG",     "&&&&")


def calcWindowGCContent(seq:str) -> float:
    """Fraction of determined (ACGT, either case) positions that are G/C;
    nan when the window has no determined nucleotides."""
    determined = seq.translate(translateAllDeterminedNucleotides).count('&')
    if not determined:
        return nan

    gc = seq.translate(translateGCNucleotides).count('&')
    return gc / determined

def calcWindowPurineContent(seq:str) -> float:
    allCount     = seq.translate( translateAllDeterminedNucleotides ).count('&')
    if allCount==0:
        return nan
Exemple #19
0
import dedupe.levenshtein as levenshtein

# Tokenizer helpers: bound methods of precompiled patterns, so each call
# is a single C-level findall/match.
words = re.compile(r"[\w']+").findall
integers = re.compile(r"\d+").findall
start_word = re.compile(r"^([\w']+)").match
start_integer = re.compile(r"^(\d+)").match
# tokens that mix letters and at least one digit
alpha_numeric = re.compile(r"(?=.*\d)[a-zA-Z\d]+").findall

# Version-portable punctuation stripper.
# NOTE: was `sys.version < '3'` — a lexicographic compare on the version
# *string*, which is fragile; use the version_info tuple instead.
if sys.version_info < (3,):
    PUNCTUATION = string.punctuation

    def strip_punc(s):
        # Python 2 bytes.translate(table=None, deletechars=...) semantics
        s = s.encode('utf-8').translate(None, PUNCTUATION)
        return s.decode('utf-8')
else:
    # one reusable deletion table for str.translate
    PUNCTABLE = str.maketrans("", "", string.punctuation)

    def strip_punc(s):
        """Return s with all ASCII punctuation removed."""
        return s.translate(PUNCTABLE)


class Predicate(object):
    def __iter__(self):
        yield self

    def __repr__(self):
        return "%s: %s" % (self.type, self.__name__)

    def __hash__(self):
        try:
            return self._cached_hash
import uuid
from builtins import (ascii, bytes, chr, dict, filter, hex, input, int, map,
                      next, oct, open, pow, range, round, str, super, zip)
from itertools import chain, combinations, repeat
from operator import itemgetter
from warnings import warn

import numpy as np
import pandas as pd
from IPython.core.display import display
from past.utils import old_div

import editdistance

# print(string.punctuation, type(unicode(string.punctuation,'utf-8')))
# Table mapping every ASCII punctuation character to a space.
TRAN_TBL = str.maketrans(str(string.punctuation), u' ' * len(string.punctuation))


def make_good_label(x_value):
    """Return something that is a better label.

    Arguments:
        x_value {string} -- or something that can be converted to a string
    """
    # punctuation -> spaces, collapse whitespace to underscores, lowercase
    cleaned = str(x_value).translate(TRAN_TBL)
    return '_'.join(cleaned.split()).lower()


def mash(dframe, flds=None, keep_zeros=False):
    """Returns df of non-null and non-zero on flds
#
#

from __future__ import division
from __future__ import print_function
from builtins import str
from builtins import range
from builtins import object
from past.utils import old_div
import sys
import string
import random
import os
import re
import pysam
# DNA complement table (case-preserving: A<->T, C<->G)
tt = str.maketrans("ACTGactg", "TGACtgac")


class fastaWriter(object):
    """Minimal buffered FASTA writer.

    Accumulates sequence characters in ``buff`` for output to ``filename``.
    NOTE(review): the line-wrapping/flush logic is not visible in this
    excerpt — confirm against the full class before relying on it.
    """

    def __init__(self, filename, linelen=60):
        #        print(filename, file=sys.stderr)
        # output handle for the FASTA records
        self.f = open(filename, "w")
        #        print(self.f, file=sys.stderr)
        # target number of sequence characters per line
        self.linelen = linelen
        # sequence characters received but not yet written
        self.buff = ""
        # running count of characters for the current record
        self.x = 0
        # current record name
        self.name = ""

    def write(self, s):
        """Append s to the internal buffer and advance the running count."""
        self.buff += s
        self.x += len(s)
Exemple #22
0
'''
Updated 19DEC2016

@author: Matt Brewer
@organization: University of Bristol
@contact: [email protected]
@summary: Module to handle reading of FASTA files
'''
from __future__ import print_function
from builtins import str, range
import re
from collections import deque

# Complement tables (uppercase only): DNA A<->T / C<->G, RNA A<->U / C<->G
TRANS_TABLE = str.maketrans('ATCG', 'TAGC')
RNA_TRANS_TABLE = str.maketrans('AUCG', 'UAGC')


class Codons(object):
    '''codon list'''

    # TODO: implement different DNA translation tables
    def __init__(self):
        self.codons = {
            'TTT': 'F',
            'TTC': 'F',
            'TTA': 'L',
            'TTG': 'L',
            'TCT': 'S',
            'TCC': 'S',
            'TCA': 'S',
            'TCG': 'S',
    def fitTarquin(self, outpath):
        nameinit = self.PatName
        dialog = PatNameDialog(nameinit)
        if dialog.exec_():
            name = dialog.name.text()

        try:
            self.PatName = name
        except:
            self.PatName = nameinit

        outpath = Path(outpath)
        #Tarquindir = outpath + '\\' + 'Tarquin_files'
        Tarquindir = outpath / 'Tarquin_files'
        name = self.filename[(self.filename.rfind('\\') + 1):].translate(
            str.maketrans('', '', r'.'))
        filename = name + 'proc_Tarquin'
        #file_path = Tarquindir + '\\' + filename
        file_path = str(Path(Tarquindir, filename).resolve())

        #Tarquinfitdir = Tarquindir + '\\' + 'Tarquin_fit'
        Tarquinfitdir = Tarquindir / 'Tarquin_fit'

        # if os.path.isdir(Tarquinfitdir) == False:
        #     os.chdir(Tarquindir)
        #     os.mkdir('Tarquin_fit')
        #
        Tarquinfitdir.mkdir(parents=True, exist_ok=True)

        # reportout = Tarquinfitdir + '\\' + self.PatName + '_Report.pdf'
        # tempout = Tarquinfitdir + '\\' + filename + '_temp.pdf'
        # pdfout = Tarquinfitdir + '\\' + filename + '_plot.pdf'
        # dataout = Tarquinfitdir + '\\' + filename + '_data.csv'
        # moddataout = Tarquinfitdir + '\\' + filename + '_data_with_ratios.csv'
        # resout = Tarquinfitdir + '\\' + filename + '_results.csv'
        # self.fitout = Tarquinfitdir + '\\' + filename + '_fit.txt'
        # basis = 'S:\\Neonate_data\\Tarquin\\3_0T_basis_threonine_no_MM'
        # tarquin = 'S:\\Neonate_data\\Tarquin\\TARQUIN_Windows_4.3.7\\tarquin\\tarquin'

        reportout = str(
            Path(Tarquinfitdir,
                 str(self.PatName) + '_Report.pdf').resolve())
        #reportout = Path(Tarquinfitdir , self.PatName + '_Report.pdf')
        tempout = str(Path(Tarquinfitdir, filename + '_temp.pdf').resolve())
        pdfout = str(Path(Tarquinfitdir, filename + '_plot.pdf').resolve())
        dataout = str(Path(Tarquinfitdir, filename + '_data.csv').resolve())
        moddataout = str(
            Path(Tarquinfitdir, filename + '_data_with_ratios.csv').resolve())
        resout = str(Path(Tarquinfitdir, filename + '_results.csv').resolve())
        self.fitout = str(Path(Tarquinfitdir, filename + '_fit.txt').resolve())

        basis = str(Path(BASE_DIR, '3_0T_basis_threonine_no_MM').resolve())
        print(f'basis: {basis}')
        tarquin_path = Path(BASE_DIR, 'tarquingui.app/Contents/MacOS/tarquin')

        if tarquin_path.exists():
            tarquin = str(tarquin_path.resolve())
        elif shutil.which("tarquin"):
            tarquin = shutil.which("tarquin")
        else:
            error = f'\nTarquin not found. \nTo solve it please:\n a) copy the Tarquin app inside {BASE_DIR} folder, or\n b) add Tarquin to the Path. e.g. >> export PATH=$PATH:/Applications/tarquingui.app/Contents/MacOS\n'
            print(error)
            sys.exit(error)

        command = (tarquin + ' --input ' + file_path + ' --output_pdf ' +
                   pdfout + ' --output_csv ' + dataout + ' --output_fit ' +
                   self.fitout + ' --basis_csv ' + basis)

        # run the command
        print('this the the command for tarquin: ', command)
        os.system(command)

        #Add in sode code to automatically calculate the Lac/Naa ratio
        #Note that this will assume that the correct basis set is used
        #csvfile = open(dataout, 'rb')

        with open(dataout) as csvfile:
            linereader = csv.reader(csvfile, delimiter=',')
            #linereader = pd.read_csv(dataout,delimiter = ',')
            CSVstore = []

            counter = 0
            for row in linereader:
                counter += 1
                print(row)

                if counter == 2:
                    row.append('Lac+T/tNaa')
                    row.append('tNaa/tCho')
                    row.append('tNaa/Cr')
                    row.append('tCho/Cr')
                    row.append('Lac+T/tCho')
                    row.append('Lac+T/Cr')

                if counter == 5:
                    row.append('Lac+T/tNaa')
                    row.append('tNaa/tCho')
                    row.append('tNaa/Cr')
                    row.append('tCho/Cr')
                    row.append('Lac+T/tCho')
                    row.append('Lac+T/Cr')
            #Calc ratio
                if counter == 3:
                    #dummy = str(row)
                    #dummy = dummy.translate(None, ''.join(["[", "'", "]"]))

                    #print('dummy is: ',dummy)

                    #fields = dummy.split(', ')
                    fields = row

                    print('type of fields[14] is: ', type(fields[14]))
                    print('fields[14] is: ', fields[14])

                    Lac = np.float(fields[14])
                    Naa = np.float(fields[15])
                    NaaG = np.float(fields[16])
                    Thre = np.float(fields[21])
                    Cr = np.float(fields[6])
                    tCho = np.float(fields[23])
                    L_N = old_div((Lac + Thre), (Naa + NaaG))
                    N_Ch = old_div((Naa + NaaG), tCho)
                    N_Cr = old_div((Naa + NaaG), Cr)
                    Ch_Cr = old_div(tCho, Cr)
                    L_Ch = old_div((Lac + Thre), tCho)
                    L_Cr = old_div((Lac + Thre), Cr)
                    row.append(str(L_N))
                    row.append(str(N_Ch))
                    row.append(str(N_Cr))
                    row.append(str(Ch_Cr))
                    row.append(str(L_Ch))
                    row.append(str(L_Cr))

            #calc error
                if counter == 6:
                    dummy = str(row)
                    # #dummy = dummy.translate(None, ''.join(["[", "'", "]"]))
                    #dummy = dummy.translate(''.join(["[", "'", "]"]))
                    fields = row
                    Lace = np.float(fields[14])
                    Naae = np.float(fields[15])
                    NaaGe = np.float(fields[16])
                    Three = np.float(fields[21])
                    Cre = np.float(fields[6])
                    tChoe = np.float(fields[23])

                    Lerr = np.sqrt(np.power(Lace, 2) + np.power(Three, 2))
                    Nerr = np.sqrt(np.power(Naae, 2) + np.power(NaaGe, 2))
                    L_Ne = np.sqrt(
                        np.power(old_div(Lerr, (Lac + Thre)), 2) +
                        np.power(old_div(Nerr, (Naa + NaaG)), 2)) * L_N
                    N_Che = np.sqrt(
                        np.power(old_div(Nerr, (Naa + NaaG)), 2) +
                        np.power(old_div(tChoe, (tCho)), 2)) * N_Ch
                    N_Cre = np.sqrt(
                        np.power(old_div(Nerr, (Naa + NaaG)), 2) +
                        np.power(old_div(Cre, (Cr)), 2)) * N_Cr
                    Ch_Cre = np.sqrt(
                        np.power(old_div(tChoe, (tCho)), 2) +
                        np.power(old_div(Cre, (Cr)), 2)) * Ch_Cr
                    L_Che = np.sqrt(
                        np.power(old_div(Lerr, (Lac + Thre)), 2) +
                        np.power(old_div(tChoe, (tCho)), 2)) * L_Ch
                    L_Cre = np.sqrt(
                        np.power(old_div(Lerr, (Lac + Thre)), 2) +
                        np.power(old_div(Cre, (Cr)), 2)) * L_Cr
                    row.append(str(L_Ne))
                    row.append(str(N_Che))
                    row.append(str(N_Cre))
                    row.append(str(Ch_Cre))
                    row.append(str(L_Che))
                    row.append(str(L_Cre))

            #get FWHM and SNR
                if counter == 9:
                    #dummy = str(row)
                    #dummy = dummy.translate(''.join(["[", "'", "]"]))
                    #fields = dummy.split(", ")
                    fields = row
                    FWHM = np.float(fields[7])
                    SNR = np.float(fields[9])

                CSVstore.append(row)
                #linewriter.writerow(row)
                #
            #csvfile.close()

        resultsout = open(resout, 'w')
        line1 = 'Ratio, Value, Error, Proc FWHM, Proc SNR'
        print(line1)
        line2 = 'L+T/tNaa,' + str(L_N) + ',' + str(L_Ne) + ',' + str(
            FWHM) + ',' + str(SNR)
        line3 = 'tNaa/tCho,' + str(N_Ch) + ',' + str(N_Che)
        line4 = 'tNaa/Cr,' + str(N_Cr) + ',' + str(N_Cre)
        line5 = 'tCho/Cr,' + str(Ch_Cr) + ',' + str(Ch_Cre)
        line6 = 'L+T/tCho,' + str(L_Ch) + ',' + str(L_Che)
        line7 = 'L+T/Cr,' + str(L_Cr) + ',' + str(L_Cre)
        resultsout.write(line1)
        resultsout.write('\n')
        resultsout.write(line2)
        resultsout.write('\n')
        resultsout.write(line3)
        resultsout.write('\n')
        resultsout.write(line4)
        resultsout.write('\n')
        resultsout.write(line5)
        resultsout.write('\n')
        resultsout.write(line6)
        resultsout.write('\n')
        resultsout.write(line7)

        resultsout.close()

        csvout = open(moddataout, 'w')
        for line in CSVstore:
            c = str(line)
            #line2 = c.translate(None, ''.join(["[", "'", "]"]))
            line2 = c.translate(''.join(["[", "'", "]"]))
            #print line2
            csvout.write(line2)
            csvout.write('\n')

        csvout.close()

        pdf = PDF()
        pdf.alias_nb_pages()
        pdf.add_page()
        pdf.set_font('Arial', 'B', 16)
        titleout = 'Spectroscopy Report for ' + str(self.PatName)
        pdf.cell(0, 0, titleout, 0, 0, 'C')
        pdf.ln(15)

        pdf.set_font('Arial', 'B', 14)
        pdf.cell(10)
        pdf.cell(0, 0, 'Metabolite Ratios', 0, 0, 'L')

        pdf.ln(5)
        pdf.cell(10)
        pdf.set_font('Arial', 'B', 12)
        pdf.cell(20, 10, 'Ratio', 1, 0, 'C')
        pdf.cell(25, 10, 'tNaa/tCho', 1, 0, 'C')
        pdf.cell(25, 10, 'tNaa/Cr', 1, 0, 'C')
        pdf.cell(25, 10, 'tCho/Cr', 1, 0, 'C')
        pdf.cell(25, 10, 'L+T/tCho', 1, 0, 'C')
        pdf.cell(25, 10, 'L+T/Cr', 1, 0, 'C')
        pdf.cell(25, 10, 'L+T/tNaa', 1, 1, 'C')

        pdf.cell(10)
        pdf.cell(20, 10, 'Value', 1, 0, 'C')
        pdf.set_font('Arial', '', 10)
        textout = str(round(N_Ch, 2))
        pdf.cell(25, 10, textout, 1, 0, 'C')
        textout = str(round(N_Cr, 2))
        pdf.cell(25, 10, textout, 1, 0, 'C')
        textout = str(round(Ch_Cr, 2))
        pdf.cell(25, 10, textout, 1, 0, 'C')
        textout = str(round(L_Ch, 2))
        pdf.cell(25, 10, textout, 1, 0, 'C')
        textout = str(round(L_Cr, 2))
        pdf.cell(25, 10, textout, 1, 0, 'C')
        pdf.set_font('Arial', 'B', 12)
        textout = str(round(L_N, 2))
        pdf.cell(25, 10, textout, 1, 1, 'C')

        pdf.cell(10)
        pdf.cell(20, 10, 'Error', 1, 0, 'C')
        pdf.set_font('Arial', '', 10)
        textout = str(round(N_Che, 2))
        pdf.cell(25, 10, textout, 1, 0, 'C')
        textout = str(round(N_Cre, 2))
        pdf.cell(25, 10, textout, 1, 0, 'C')
        textout = str(round(Ch_Cre, 2))
        pdf.cell(25, 10, textout, 1, 0, 'C')
        textout = str(round(L_Che, 2))
        pdf.cell(25, 10, textout, 1, 0, 'C')
        textout = str(round(L_Cre, 2))
        pdf.cell(25, 10, textout, 1, 0, 'C')
        pdf.set_font('Arial', 'B', 12)
        textout = str(round(L_Ne, 2))
        pdf.cell(25, 10, textout, 1, 1, 'C')

        pdf.ln(3)
        pdf.cell(10)
        pdf.set_font('Arial', 'B', 10)
        pdf.cell(0, 5, 'Notes:', 0, 1, 'L')
        pdf.set_font('Arial', '', 10)
        pdf.cell(10)
        pdf.cell(
            0, 5,
            'L+T = Lactate + Threonine.  Including Threonine yields a better fit at ~ 1.3 ppm',
            0, 1, 'L')
        pdf.cell(10)
        pdf.cell(0, 5, 'tNaa = Total Naa', 0, 1, 'L')
        pdf.cell(10)
        pdf.cell(0, 5, 'tCho = Total Choline', 0, 1, 'L')
        pdf.cell(10)
        pdf.cell(
            0, 5,
            'Errors on ratios calculated using Cramer-Rao low bounds on Tarquin fit',
            0, 1, 'L')

        pdf.ln(5)
        pdf.cell(10)
        pdf.set_font('Arial', 'B', 10)
        pdf.cell(
            0, 5,
            'Some care must be taken when comparing Tarquin ratios to jMRUI ratios:',
            0, 1, 'L')
        pdf.cell(10)
        pdf.set_font('Arial', '', 10)
        pdf.cell(
            0, 5,
            '1) Tarquin fits a complete basis spectrum for each metabolite whereas jMRUI fits individual peaks',
            0, 1, 'L')
        pdf.cell(10)
        pdf.cell(
            0, 5,
            '2) Tarquin effectively produces T2-weighted metabolite concentration ratios',
            0, 1, 'L')
        pdf.cell(10)
        pdf.cell(0, 5,
                 '3) jMRUI effectively produces T2-weighted peak-area ratios',
                 0, 1, 'L')
        pdf.cell(10)
        pdf.cell(
            0, 5,
            '4) The Choline peak has 9 equivalent protons whereas the other prominent peaks have only 3',
            0, 1, 'L')
        pdf.cell(10)
        pdf.cell(
            0, 5,
            '5) This means Tarquin ratios involving Choline are approximately a factor 3 different to jMRUI ratios',
            0, 1, 'L')
        pdf.cell(10)
        pdf.cell(
            0, 5,
            '6) When comparing Cho ratios to previous or published data please be careful of the methodologies used',
            0, 1, 'L')
        pdf.cell(10)
        pdf.cell(0, 5, '7) LCModel data will be similar to Tarquin', 0, 1, 'L')
        pdf.cell(10)
        pdf.cell(
            0, 5,
            '8) If in doubt, please contact Medical Physics for help or clarification',
            0, 1, 'L')

        pdf.ln(15)
        pdf.set_font('Arial', 'B', 14)
        pdf.cell(10)
        pdf.cell(0, 0, 'Spectrum Quality Control', 0, 0, 'L')

        pdf.ln(5)
        pdf.cell(10)
        pdf.set_font('Arial', 'B', 12)
        pdf.cell(40, 10, 'Proc FWHM', 1, 0, 'C')
        pdf.cell(40, 10, 'Proc SNR', 1, 0, 'C')
        pdf.cell(40, 10, 'Echo Time', 1, 1, 'C')
        pdf.set_font('Arial', '', 10)
        pdf.cell(10)
        textout = str(round(FWHM, 2))
        pdf.cell(40, 10, textout, 1, 0, 'C')
        textout = str(round(SNR, 2))
        pdf.cell(40, 10, textout, 1, 0, 'C')
        pdf.cell(40, 10, self.displayTE, 1, 1, 'C')

        pdf.ln(3)
        pdf.cell(10)
        pdf.set_font('Arial', 'B', 10)
        pdf.cell(0, 5, 'Notes:', 0, 1, 'L')
        pdf.set_font('Arial', '', 10)
        pdf.cell(10)
        pdf.cell(
            0, 5,
            'FWHM = Full Width Half Maximum: Measure of linewidth in ppm', 0,
            1, 'L')
        pdf.cell(10)
        pdf.cell(0, 5, 'SNR = Signal to Noise Ratio', 0, 1, 'L')

        pdf.output(tempout, 'F')

        # Merge PDF files
        pdfFileObj1 = open(tempout, 'rb')
        pdfFileObj2 = open(pdfout, 'rb')

        pdfReader1 = PyPDF2.PdfFileReader(pdfFileObj1)
        pdfReader2 = PyPDF2.PdfFileReader(pdfFileObj2)

        pageObj1 = pdfReader1.getPage(0)
        pageObj2 = pdfReader2.getPage(0)
        pageObj2.rotateClockwise(270)

        pdf_writer = PyPDF2.PdfFileWriter()
        pdf_writer.addPage(pageObj1)
        pdf_writer.addPage(pageObj2)

        pdf_out = open(reportout, 'wb')
        pdf_writer.write(pdf_out)
        pdf_out.close()

        pdfFileObj1.close()
        pdfFileObj2.close()

        print(f'\n\nMRS Report saved in {reportout}')
        self.report_completed(reportout)
Exemple #24
0
def asCplexName(name):
    """Return *name* as a string with characters CPLEX rejects replaced by '_'.

    CPLEX identifiers may not contain '-', '+', brackets, spaces, '>' or '/';
    each occurrence is mapped onto an underscore in a single pass.
    """
    table = str.maketrans(dict.fromkeys("-+[] ->/", "_"))
    return str(name).translate(table)
Exemple #25
0
def timestampFormat():
    """Format for current local time, suitable for file names.
    >>> timestampFormat()
    '%Y-%m-%d_%H-%M-%S'
    """
    # Swap the filename-hostile characters (space -> '_', colon -> '-') in the
    # logging formatter's date format string.
    sanitize = str.maketrans(" :", "_-")
    return str(FORMATTER.datefmt).translate(sanitize)
Exemple #26
0
def _y64_2_b64(s):
    return s.translate(str.maketrans('-._', '=+/'))
# Redis key templates for per-CDS metadata; formatted with (taxId, proteinId).
flanking3utrLengthKey = "CDS:taxid:%d:protid:%s:3utr-flank-length-nt"
nextCdsOnOppositeStrandKey = "CDS:taxid:%d:protid:%s:next-cds-opp-strand"
#genomicCoordStartKey  = "CDS:taxid:%d:protid:%s:genomic-start"
#genomicCoordEndKey    = "CDS:taxid:%d:protid:%s:genomic-end"
#partialCDSKey        = "CDS:taxid:%d:protid:%s:partial"
# Redis set holding all protein ids registered for a species; formatted with taxId.
speciesCDSList = "species:taxid:%d:CDS"
# Extracts the first two dot-separated components of an identifier
# (e.g. "Dusal.1637s00001.1" -> "Dusal.1637s00001").
regexLocusId = re.compile("([^.]+[.][^.]+)[.].*")

# metadata server (redis); connection parameters come from the project config module
r = redis.StrictRedis(host=config.host,
                      port=config.port,
                      db=config.db,
                      password=config.password)
# sequences server (mysql)
session = db.Session()

# Maps every IUPAC ambiguous-nucleotide code (both cases) to lowercase 'n'
# so downstream encoding only has to deal with A/C/G/T/n.
translateAmbiguousNucleotides = str.maketrans("RrYyKkMmSsWwBbDdHhVv",
                                              "nnnnnnnnnnnnnnnnnnnn")

def storeSeqInDB(nucSeq,
                 taxId: int,
                 proteinId: str,
                 seqSourceTag: int,
                 stopCodonPos: int = -1,
                 genomeCoords: tuple = (),
                 nextCDSonOppositeStrand: bool = None,
                 cdsLengthNt: int = None,
                 flankingRegionLengthNt: int = None) -> None:

    # Compress the CDS sequence
    encodedCds = nucleic_compress.encode(
        str(nucSeq).translate(translateAmbiguousNucleotides)
Exemple #28
0
from __future__ import print_function, division
import sys
import re
import json
from builtins import bytes, str
from .schema import SimpleSchema, SimpleAttribute, make_export_schema
from . import tree

# Secondary-edge labels accepted when parsing Negra/Tiger export files.
allowable_secedge = {'refint', 'refvc', 'refmod', 'refcontr', 'EN', 'HD', 'SB', 'OA', 'DA', 'CP', 'MO', 'EP', 'SVP',
                     'PPROJ'}

# Matches tokens that start with one or more '#' followed by whitespace
# (export-format marker lines such as "#BOS"/"#EOS" headers).
hash_token_re = re.compile('^#+\\s')
# reads lines in an export file and creates a nodes structure
# reads up to and including the #EOS

# Translation table replacing spaces with underscores in token text.
kill_spaces_tr = str.maketrans(' ', '_')

def read_sentence(f, format=3):
    '''
    reads a sentence in export format from the file descriptor f
    :param format: the Negra-Export version
    :param encoding: if a value is supplied here, the file will
      be assumed to have this encoding
    :param tree_encoding: passing None here means that the tree will
      contain unicode strings in the word, lemma, and comment fields,
      otherwise they will follow this encoding
    '''
    t = tree.Tree()
    secedges = []
    pos = 0
    l = f.readline().strip()
Exemple #29
0
def asCplexName(name):
    """Map characters that CPLEX forbids in identifiers onto underscores.

    Accepts any value; it is stringified before sanitising.
    """
    forbidden = "-+[] ->/"
    # str.translate accepts a mapping of code points to replacement strings.
    table = {ord(ch): "_" for ch in forbidden}
    return str(name).translate(table)
def standalone():
    argsParser = argparse.ArgumentParser()
    argsParser.add_argument("--taxid", type=int)
    argsParser.add_argument("--input")
    argsParser.add_argument("--variant",
                            type=parseOption(
                                set(("yeastgenome", "NCBI", "Ensembl", "JGI")),
                                "variant"))
    argsParser.add_argument("--type",
                            type=parseOption(
                                set(("cds", "shuffle", "fixCDSkey")),
                                "sequence type"))
    argsParser.add_argument("--dry-run", action="store_true", default=False)
    argsParser.add_argument("--output-fasta")
    argsParser.add_argument("--gene-ids-file")
    argsParser.add_argument("--alt-protein-ids",
                            type=parseOption(set(("locus_tag", )),
                                             "alt-protein-id"))
    argsParser.add_argument("--headers-from-another-fasta")
    argsParser.add_argument("--ignore-id-check",
                            action="store_true",
                            default=False)
    args = argsParser.parse_args()

    if (args.output_fasta):
        if (args.output_fasta == args.input):
            raise Exception("Fasta output file cannot match input file!")

    #if( len(sys.argv) < 5 ):
    #    print("Usage: %s <taxid> <fasta-file> <fasta-variant> <cds|shuffle>" % (sys.argv[0],))
    #    sys.exit(-1)

    # command-line arguments
    taxId = args.taxid
    f = None
    if (args.input[-3:] == ".gz"):
        f = gzip.open(args.input, "r")
    elif (args.input[-4:] == ".bz2"):
        # TODO: impl this...
        assert (False)
    else:
        f = open(args.input, 'r')
    #sequenceFormat = args.variant
    sequenceType = args.type

    if (sequenceType == "cds"):
        seqSourceTag = db.Sources.External
    elif (sequenceType == "shuffle"):
        seqSourceTag = db.Sources.ShuffleCDSv2_matlab
    elif (sequenceType == "fixCDSkey"):
        seqSourceTag = None
    else:
        raise Exception("Unknown sequence type '%s'" % sequenceType)

    # establish connections
    # metadata server (redis)
    #r = redis.StrictRedis(host=config.host, port=config.port, db=config.db, password=config.password)
    # sequences server (mysql)
    #session = db.Session()

    visitedProteinIds = set()

    assert (r.exists("species:taxid:%d:name" % taxId))

    if (seqSourceTag == db.Sources.External):
        # Clear any previously imported CDSs...
        #r.delete(speciesCDSList % (taxId,))
        count = data_helpers.countSpeciesCDS(taxId)
        if (count > 0 and (not args.dry_run)):
            print("%d sequences already exist for specied %d. Aborting..." %
                  (count, taxId))
            sys.exit(-1)
    elif (sequenceType == "fixCDSkey"):
        r.delete(speciesCDSList % (taxId, ))
        # Delete and reconstruct the CDS key
    else:
        assert (data_helpers.countSpeciesCDS(taxId) > 0)

    reNuclearYeastGene = re.compile("Y[A-P][RL]\d+[CW](-[A-Z])?")
    geneIdsToInclude = set()
    if (args.gene_ids_file):
        with open(args.gene_ids_file, "r") as genesFile:
            for geneId in genesFile:
                geneIdsToInclude.add(geneId.rstrip())

    reNCBIattributes = re.compile("\[(\S+)=([^\]]+)\]")
    reNCBIbareheader = re.compile("\w+\|\w+\.\d+_cds_(\w+.\d+)_\d+")
    outRecords = []

    headersFromAnotherFasta = {}
    if args.headers_from_another_fasta:
        with open(args.headers_from_another_fasta, "r") as f2:
            for record in SeqIO.parse(f2, "fasta", alphabet=generic_dna):
                assert (not record.id in headersFromAnotherFasta)
                headersFromAnotherFasta[record.id] = record.description

    cdsCount = 0
    notFoundCount = 0
    skippedCount = 0
    validNucleotideChars = str.maketrans("ACGTacgt", "%%%%%%%%")
    #print("Opening fasta file: {}".format(f))
    for record in SeqIO.parse(f, "fasta", alphabet=generic_dna):
        #proteinId = regexLocusId.match(record.id).group(1) # Work-around for multiple-transcript identifiers in JGI's Chlamy genome

        if args.headers_from_another_fasta:
            record.description = headersFromAnotherFasta[record.id]

        numNonNucleotideChars = len(record.seq) - str(
            record.seq).translate(validNucleotideChars).count("%")
        if numNonNucleotideChars:
            print(
                "Skipping record %s, containing non-nucleotide or ambiguous symbols '%s'"
                % (record.id, numNonNucleotideChars))
            skippedCount += 1
            continue

        # yeastgenome.org - skip suspected pseudo-genes
        if (args.variant == "yeastgenome"
                and record.description.find("Dubious ORF") != -1):
            skippedCount += 1
            continue

        # yeastgenome.org - skip mitochondrial genes
        if (args.variant == "yeastgenome"):
            geneType = record.id[0]
            if geneType == "Q" or geneType == "R":
                skippedCount += 1
                continue

        # yeastgenome.org - verify gene-id conforms to: http://www.yeastgenome.org/help/community/nomenclature-conventions
        if (args.variant == "yeastgenome"):
            geneId = record.id
            assert (reNuclearYeastGene.match(geneId))

        # Obtain attributes mapping
        attributes = []
        if (args.variant == "NCBI"):
            attributes = dict(re.findall(reNCBIattributes, record.description))

        if (args.variant == "NCBI"):
            if ('pseudo' in attributes and attributes['pseudo'] == 'true'):
                print("Skipping pseudo-gene entry %s" % (record.id, ))
                skippedCount += 1
                continue

        # Determine gene id
        proteinId = None
        additionalProteinIds = set()
        altProteinId = None
        if (args.variant == "yeastgenome"):
            proteinId = record.id
        elif (args.variant == "NCBI"):
            if (sequenceType == "shuffle" and not attributes):
                #Workaround for shuffle-seq files missing the header...
                #Extract the protein-id from sequence-id like this:
                #>lcl|NC_002516.2_cds_NP_064721.1_1
                if not args.alt_protein_ids:
                    proteinId = reNCBIbareheader.match(record.id).group(1)

                elif args.alt_protein_ids == "locus_tag":
                    if ('locus_tag' not in attributes):
                        print("Skipping entry %s missing locus_tag - %s" %
                              (record.id, attributes))
                        skippedCount += 1
                        continue
                    proteinId = attributes['locus_tag']
                    print(proteinId)
                else:
                    assert False

            else:
                # Note - not currently used
                #if 'db_xref' in attributes:
                #    _db_xrefs = attributes['db_xref'].split(",")
                #    db_xrefs = dict(map( lambda x: tuple(x.split(":")), _db_xrefs))
                if not args.alt_protein_ids:
                    if ('protein_id' not in attributes):
                        print("Skipping entry %s missing protein_id - %s" %
                              (record.id, attributes))
                        skippedCount += 1
                        continue

                    proteinId = attributes['protein_id']
                elif args.alt_protein_ids == "locus_tag":
                    if ('locus_tag' not in attributes):
                        print("Skipping entry %s missing locus_tag - %s" %
                              (record.id, attributes))
                        skippedCount += 1
                        continue
                    proteinId = attributes['locus_tag']

                    if ('protein_id' in attributes):
                        altProteinId = attributes['protein_id']

                else:
                    assert (False)

        elif (args.variant == "Ensembl"):
            # Sample id: ABD29211.1
            dotPos = record.id.rfind('.')
            if (dotPos > 3):
                proteinId = record.id[:dotPos]
                additionalProteinIds.add(
                    record.id
                )  # also allow matching the full format (including the transcript-id) - some CDS files include it...

            else:
                proteinId = record.id

        elif (args.variant == "JGI"):
            # Variant 1 (Phytozome, Mpus)
            #  (gff3):  60050
            #  (fasta): 60050
            # Variant 2 (Phytozome, Dsal)
            #  (gff3):  Dusal.1637s00001.1
            #  (fasta): Dusal.1637s00001.1
            # Variant 3:
            #  (gff3):  jgi.p|Ostta1115_2|10314
            #  (fasta): jgi|Ostta1115_2|10314|CE10313_131

            proteinId = record.id

            if record.id.startswith("jgi|"):
                parts = record.id.split('|')
                parts[0] = 'jgi.p'  # add the '.p'
                additionalProteinIds.add('|'.join(
                    parts[:3]))  # drop the suffix (parts[4])

        else:
            assert (False)

        if not args.ignore_id_check:
            assert (len(proteinId) > 2)

        # Skip sequences that have non-standard translations
        if (args.variant == "NCBI"):
            if "transl_except" in attributes:
                print("Skipping %s (because of transl_except)" % (proteinId, ))
                skippedCount += 1
                continue

        # If an inclusion list (white list) is defined, skip sequences missing from it
        if args.gene_ids_file:
            if (proteinId not in geneIdsToInclude):
                # Also try the additional ids
                if (not geneIdsToInclude.intersection(additionalProteinIds)):
                    print("Skipping %s (sequence %s, alternate ids=%s)" %
                          (proteinId, record.id, list(additionalProteinIds)))
                    skippedCount += 1
                    continue

        print("Inserting %s (sequence %s)..." % (proteinId, record.id))

        # Verify there are no duplicates entries
        if (proteinId in visitedProteinIds):
            print("MULTIPLE Entry: %s", proteinId)
            skippedCount += 1
            continue
        #assert(proteinId not in visitedProteinIds)
        visitedProteinIds.add(proteinId)

        # Write the filtered sequences into an output file (if needed)
        # Note - this also works in dry-run...
        if (args.output_fasta):
            outRecords.append(record)

        if (args.dry_run):
            continue

        if (sequenceType == "fixCDSkey"):
            cds = data_helpers.CDSHelper(taxId, proteinId)
            seqId = cds.seqId()
            if (not seqId is None):
                r.sadd(speciesCDSList % (taxId, ), proteinId)
            else:
                print("Couldn't find entry for proteinId=%s" % proteinId)

            continue  # Skip the rest of the processing...

        storeSeqInDB(nucSeq=record.seq,
                     taxId=taxId,
                     proteinId=proteinId,
                     seqSourceTag=seqSourceTag)

        cdsCount += 1

    if (notFoundCount + skippedCount > 0):
        print("Warning: %d entries skipped and %d entries not found" %
              (skippedCount, notFoundCount))

    print("Processed %d CDS entries" % (cdsCount, ))
    print("(out of %d CDS entries for this species)" %
          (r.scard("species:taxid:%d:CDS" % (taxId, ))))

    if (args.output_fasta):
        with open(args.output_fasta, "w") as outfile:
            out = SeqIO.write(outRecords, outfile, "fasta")