Ejemplo n.º 1
0
    def setUp(self):
        """
        Prepare the command-line parameter lists used by the test cases.

        The sample workbook and the output folder are each searched for in
        several candidate locations relative to the current working directory.
        The first existing candidate is used as an absolute path; when none
        exists the value stays an empty string.

        Sets ``self.pass_params`` (existing input) and
        ``self.fail_input_params`` (input path with a ``.txt`` name).
        """
        logger.debug('SETUP TESTS...')

        input_candidates = (
            r'../test/TestInput/test_crosscheck.xlsx',
            r'test/TestInput/test_crosscheck.xlsx',
            r'../TestInput/test_crosscheck.xlsx',
            r'TestInput/test_crosscheck.xlsx',
        )
        # First candidate that is an existing file, else ''.
        in_file = next(
            (os.path.abspath(c) for c in input_candidates
             if os.path.isfile(c)),
            '',
        )
        logger.info(f'Input file {in_file}')
        bad_in_file = r'test/TestInput/test_crosscheck_x.txt'

        folder_candidates = (
            r'../test/TestOutput/',
            r'test/TestOutput/',
            r'../TestOutput/',
            r'TestOutput/',
        )
        # First candidate that is an existing directory, else ''.
        out_folder = next(
            (os.path.abspath(c) for c in folder_candidates
             if os.path.isdir(c)),
            '',
        )
        out_file = os.path.join(out_folder, 'test_crosscheck_output.csv')
        logger.info(f'Out put file will be: {out_file}')

        self.pass_params = ['-i', in_file, '-o', out_file]
        self.fail_input_params = ['-i', bad_in_file, '-o', out_file]
Ejemplo n.º 2
0
    def load_file(file: str) -> dict:
        """
        Load an abbreviation table and collect the unique entries per column.

        The format is chosen by the (case-insensitive) file extension:
        ``.xlsx`` is read with ``pandas.read_excel``, ``.csv`` and ``.tsv``
        with ``pandas.read_csv``.  Any other extension is logged as an error
        and results in an empty dict.

        :param file: path of the abbreviation table
        :return: dict mapping each column header to the list of its unique,
            non-empty values
        :raises FileNotFoundError: if ``file`` is not an existing file
        """

        if not os.path.isfile(file):
            # Name the offending path so the failure can be diagnosed.
            raise FileNotFoundError(f'Can Not find file: {file}')

        file_lower = file.lower()
        if file_lower.endswith('.xlsx'):
            abbr_df = pd.read_excel(file)
        elif file_lower.endswith('.csv'):
            abbr_df = pd.read_csv(file)
        elif file_lower.endswith('.tsv'):
            abbr_df = pd.read_csv(file, sep='\t')
        else:
            abbr_df = pd.DataFrame()
            logger.error(f'Can Not load file: {file}')

        # Empty cells become '' so they can be filtered out below.
        abbr_df = abbr_df.fillna('')
        groups_lst = abbr_df.columns.tolist()
        # Column headers are not guaranteed to be strings (e.g. numeric
        # headers in a spreadsheet), so convert before joining.
        group_names = ", ".join(str(g) for g in groups_lst)
        logger.info(
            f'Input {len(groups_lst)} abbreviation groups: {group_names}'
        )

        abbr_dct = {}
        if not abbr_df.empty:
            for g in groups_lst:
                abbr_dct[g] = [
                    a for a in abbr_df[g].unique().tolist() if a != ''
                ]

        return abbr_dct
Ejemplo n.º 3
0
def main(argv):
    """
    Command-line entry point: read abbreviations and write them as SDF.

    :param argv: argument list, ``-i <input epiLION abbreviation file in
        .txt format> -o <output_file>`` (long forms ``--infile`` /
        ``--outfile``; ``-h`` logs the usage line)
    :return: True when the input file exists and was passed to
        ``epilion2sdf``; False on bad options, ``-h`` or a missing input file
    """

    usage_msg = 'epiLION.py -i <input_file> -o <output_file>'
    in_file = ''
    out_file = ''

    try:
        opts, args = getopt.getopt(argv, "hi:o:", ["infile=", "outfile="])
        logger.debug(f'User input: {opts}, {args}')
    except getopt.GetoptError:
        logger.info(usage_msg)
        return False

    for flag, value in opts:
        if flag == '-h':
            logger.info(usage_msg)
            return False
        if flag in ("-i", "--infile"):
            in_file = value
        elif flag in ("-o", "--outfile"):
            out_file = value

    # Guard clause: nothing to do without a readable input file.
    if not os.path.isfile(in_file):
        for failure_msg in ('Can NOT open input file:', in_file,
                            '!!! FAILED to PROCESS !!!'):
            logger.error(failure_msg)
        return False

    logger.info(f'Load input file: {in_file}')
    with open(in_file, 'r') as in_obj:
        epilion2sdf(in_obj.readlines(), out_file)
    logger.info(f'Save output file: {out_file}')
    logger.info('FINISHED')

    return True
Ejemplo n.º 4
0
def parse_epilion(abbr: str) -> dict:
    """
    Convert one abbreviation to its epiLION ID and summarise its structure.

    The abbreviation is first converted with ``Converter.convert_abbr``; the
    resulting ID is decoded as a fatty acid or as a phospholipid to obtain a
    SMILES string, from which formula, exact mass and a PNG drawing are
    computed.

    :param abbr: abbreviation to convert
    :return: dict with the keys ``'id'``, ``'formula'``, ``'exactmass'``
        (string with 4 decimals) and ``'img'`` (PNG as a base64 data URL);
        an empty dict when the ID cannot be parsed or no structure can be
        generated from the SMILES
    """

    fa_decoder = ParserFA()
    pl_decoder = ParserPL()

    info_dct = {}

    converter = Converter(abbr_cfg_path)
    epilion_id = converter.convert_abbr(abbr)

    if fa_decoder.is_fa(epilion_id):
        smi = fa_decoder.get_smi_fa(epilion_id)
    elif pl_decoder.is_pl(epilion_id):
        smi = pl_decoder.get_smi_pl(epilion_id)
    else:
        # Without a SMILES there is nothing to draw; returning here also
        # avoids referencing an unbound ``smi`` further down.
        logger.info(f'Can NOT parse abbreviation: {epilion_id}')
        return info_dct

    logger.info(f'{epilion_id}: {smi}')

    try:
        mol = Chem.MolFromSmiles(smi)
        AllChem.Compute2DCoords(mol)
        m_exactmass = rdMolDescriptors.CalcExactMolWt(mol)
        m_formula = rdMolDescriptors.CalcMolFormula(mol)

        # Render the molecule once into an in-memory PNG.
        img = Draw.MolToImage(mol, size=(600, 400))
        img_io = BytesIO()
        img.save(img_io, format='png')
        img_data = base64.b64encode(img_io.getvalue())
        img_data_url = r'data:image/png;base64,' + img_data.decode("utf-8")

        info_dct['id'] = epilion_id
        info_dct['formula'] = m_formula
        info_dct['exactmass'] = '%.4f' % m_exactmass
        info_dct['img'] = img_data_url

    except Exception as e:
        # Best effort: a structure that cannot be generated is logged and an
        # empty dict is returned.
        logger.error(f'! FAILED: {epilion_id}')
        logger.error(f'! FAILED to generate structure from SMILES: {smi}')
        logger.error(e)

    return info_dct
Ejemplo n.º 5
0
        return epilion_lst

    def convert_text(self, input_text: str) -> (dict, list):
        """
        Convert a newline-separated block of abbreviations.

        Every line of ``input_text`` is passed to ``self.convert_abbr``.

        :param input_text: abbreviations separated by ``'\\n'``
        :return: tuple ``(converted, rejected)``: a dict mapping each line to
            its converted ID, and a list of the lines for which
            ``convert_abbr`` returned a falsy value
        """

        converted = {}
        rejected = []

        for line in input_text.split('\n'):
            new_id = self.convert_abbr(line)
            if not new_id:
                rejected.append(line)
                continue
            converted[line] = new_id

        return converted, rejected


if __name__ == '__main__':

    # Manual run: convert the sample cross-check table using the linear FA
    # abbreviation configuration (paths are relative to this script folder).
    cfg_file = r'../configurations/LinearFA_abbreviations.xlsx'
    test_in_file = r'../test/TestInput/test_crosscheck.xlsx'
    test_out_file = r'../test/TestOutput/test_crosscheck_output.xlsx'

    converter = Converter(cfg_file)
    converter.convert_table(test_in_file, test_out_file)

    logger.info('epiLion converter finished.')
Ejemplo n.º 6
0
def epilion2sdf(abbr_lst, save_sdf):
    """
    Convert epiLION abbreviations to structures and save them as an SDF file.

    Each abbreviation is decoded as a fatty acid or as a phospholipid to get
    a SMILES string.  Every abbreviation that could be decoded is written to
    ``save_sdf`` with the properties ``EXACT_MASS``, ``NOMINAL_MASS`` and
    ``FORMULA``; abbreviations that fail are logged and skipped.

    :param abbr_lst: list of abbreviations, or the path of a text file with
        one abbreviation per line
    :param save_sdf: path of the SDF file to write
    :raises SystemExit: if ``abbr_lst`` is a string that is not an existing
        file
    """

    if isinstance(abbr_lst, str):
        if not os.path.isfile(abbr_lst):
            logger.error(f'Can NOT load input: {abbr_lst}')
            logger.info('!! END PROCESSING !!')
            # Same effect as exit(), without relying on the ``site`` helper.
            raise SystemExit
        try:
            logger.info(f'Try to open file: {abbr_lst}')
            with open(abbr_lst, 'r') as infile_obj:
                abbr_lst = infile_obj.readlines()
        except Exception as e:
            logger.error(f'Can NOT load input: {abbr_lst}')
            logger.error(e)
            # Do not fall through and iterate over the characters of the path.
            abbr_lst = []

    # Lines read from a file keep their line break; strip it so it does not
    # end up in the parsed abbreviation or in the SDF record name, and drop
    # blank lines.
    abbr_lst = [abbr.strip() for abbr in abbr_lst]
    abbr_lst = [abbr for abbr in abbr_lst if abbr]

    fa_decoder = ParserFA()
    pl_decoder = ParserPL()

    info_dct = {}

    for abbr in abbr_lst:
        logger.info(abbr)
        if fa_decoder.is_fa(abbr):
            smi = fa_decoder.get_smi_fa(abbr)
        elif pl_decoder.is_pl(abbr):
            smi = pl_decoder.get_smi_pl(abbr)
        else:
            logger.info(f'Can NOT parse abbreviation: {abbr}')
            continue
        logger.info(f'{abbr}: {smi}')
        info_dct[abbr] = smi

    # The context manager and the ``finally`` make sure that both the writer
    # and the underlying file are flushed and closed, even on errors.
    with open(save_sdf, mode='w') as sdf_obj:
        sdf_writer = Chem.SDWriter(sdf_obj)
        try:
            for m in abbr_lst:
                if m not in info_dct:
                    logger.warning(f'!! Can NOT parse: {m}')
                    continue
                smi = info_dct[m]
                try:
                    mol = Chem.MolFromSmiles(smi)
                    AllChem.Compute2DCoords(mol)
                    mol.SetProp('_Name', m)
                    m_mass = Descriptors.MolWt(mol)
                    m_exactmass = rdMolDescriptors.CalcExactMolWt(mol)
                    m_formula = rdMolDescriptors.CalcMolFormula(mol)
                    mol.SetProp('EXACT_MASS', '%.6f' % m_exactmass)
                    mol.SetProp('NOMINAL_MASS', '%.3f' % m_mass)
                    mol.SetProp('FORMULA', m_formula)
                    sdf_writer.write(mol)
                except Exception as e:
                    # Best effort: one bad structure must not stop the export.
                    logger.error(f'! FAILED: {m}')
                    logger.error(
                        f'! FAILED to generate structure from SMILES: {smi}')
                    logger.error(e)
        finally:
            sdf_writer.close()
Ejemplo n.º 7
0
                mol.SetProp('_Name', m)
                m_mass = Descriptors.MolWt(mol)
                m_exactmass = rdMolDescriptors.CalcExactMolWt(mol)
                m_formula = rdMolDescriptors.CalcMolFormula(mol)
                mol.SetProp('EXACT_MASS', '%.6f' % m_exactmass)
                mol.SetProp('NOMINAL_MASS', '%.3f' % m_mass)
                mol.SetProp('FORMULA', m_formula)
                sdf_writer.write(mol)
            except Exception as e:
                logger.error(f'! FAILED: {m}')
                logger.error(
                    f'! FAILED to generate structure from SMILES: {smi}')
                logger.error(e)
        else:
            logger.warning(f'!! Can NOT parse: {m}')


if __name__ == '__main__':

    # Manual run against the sample name list (paths are relative to this
    # script folder).
    test_file = r'../../test/TestInput/test_names.txt'
    output_file = r'../../test/TestOutput/test_names_sdf.sdf'

    with open(test_file, 'r') as input_obj:
        input_lst = input_obj.readlines()
        epilion2sdf(input_lst, output_file)

    # Run again with both accepted input kinds: a file path, then a list.
    for test_input in (test_file, input_lst):
        epilion2sdf(test_input, output_file)

    logger.info('FINISHED!')
Ejemplo n.º 8
0
            raise Exception('test bad parameter... Failed')

    def test_epiLION_Converter_bad_input(self):
        """``convLION.main`` must return False for the bad input parameters."""
        logger.debug('test bad input...')
        result = convLION.main(self.fail_input_params)
        # Use the unittest assertion so that a wrong result is reported as a
        # test failure instead of an unexpected error.
        self.assertIs(result, False, 'test bad input... Failed')
        logger.debug('test bad input... PASSED')

    def test_epiLION_Converter_good_input(self):
        """``convLION.main`` must return True for the sample input file."""
        logger.debug('test sample data...')
        result = convLION.main(self.pass_params)
        # Use the unittest assertion so that a wrong result is reported as a
        # test failure instead of an unexpected error.
        self.assertIs(result, True, 'test sample data... Failed')
        logger.debug('test sample data... PASSED')

    def tearDown(self):
        """Log the end of a test case; no cleanup is performed here."""
        logger.debug('TEST END!')


if __name__ == '__main__':
    # python convLION.py -i test/TestInput/test_crosscheck.xlsx -o test/TestOutput/test_crosscheck_output.xlsx

    epiLION_Path = os.path.dirname(os.path.abspath(__file__))
    sys.path.insert(0, epiLION_Path + '/../')

    # unittest.main() finishes by raising SystemExit, so the closing log
    # line is only reached when it is emitted from a ``finally`` clause.
    # The exit status of the test run is left untouched.
    try:
        unittest.main()
    finally:
        logger.info('TESTS FINISHED!')
Ejemplo n.º 9
0
def main(argv):
    """
    Command-line entry point: convert an abbreviation table.

    The input file is converted with ``Converter.convert_table`` using the
    configuration at ``abbr_cfg_path``.

    :param argv: argument list, ``-i <input_file> -o <output_file>`` (long
        forms ``--infile`` / ``--outfile``; ``-h`` logs the usage line)
    :return: True when the input file exists and was converted; False on bad
        options, ``-h`` or a missing input file
    """

    usage_msg = 'epiLIONConverter.py -i <input_file> -o <output_file>'
    in_file = ''
    out_file = ''

    try:
        opts, args = getopt.getopt(argv, "hi:o:", ["infile=", "outfile="])
        logger.debug(f'User input: {opts}, {args}')
    except getopt.GetoptError:
        logger.info(usage_msg)
        return False

    for flag, value in opts:
        if flag == '-h':
            logger.info(usage_msg)
            return False
        if flag in ("-i", "--infile"):
            in_file = value
        elif flag in ("-o", "--outfile"):
            out_file = value

    # Guard clause: nothing to do without a readable input file.
    if not os.path.isfile(in_file):
        for failure_msg in ('Can NOT open input file:', in_file,
                            '!!! FAILED to PROCESS !!!'):
            logger.error(failure_msg)
        return False

    logger.info(f'Load input file: {in_file}')
    Converter(abbr_cfg_path).convert_table(in_file, out_file)

    logger.info(f'Save output file: {out_file}')
    logger.info('FINISHED')
    is_output = True
    logger.info(f'is_output {is_output}')

    return is_output
Ejemplo n.º 10
0
    def get_smi_fa(self, abbr: str) -> str:
        """
        Build the SMILES string for a fatty-acid abbreviation.

        The abbreviation is decoded with ``self.decode_fa``.  The chain is
        kept as a list of SMILES fragments in which index 0 is the link head
        and index *n* is carbon *n*.  Modifications are written into that
        list, terminal fragments are appended in reverse-sorted order, and
        the fragments are joined.

        Modifications without explicit sites are placed automatically,
        starting at carbon 3.  If the chain is too short to hold all of
        them, the ones that do not fit are skipped and a warning is logged.

        :param abbr: fatty-acid abbreviation, e.g. ``'18:1'`` or ``'O-16:0'``
        :return: the SMILES string, or None when the decoded abbreviation
            has no carbon count
        """

        smi = None

        fa_info_dct = self.decode_fa(abbr)

        if fa_info_dct['NUM_C']:
            num_c = int(fa_info_dct['NUM_C'])
            c_chain_lst = [''] + ['C'] * num_c
            c_term_lst = []

            link = fa_info_dct['LINK']
            if link == 'O-':
                c_chain_lst[0] = 'O'
            elif link == 'P-':
                c_chain_lst[0] = r'O'
                c_chain_lst[1] = r'\C='
                c_chain_lst[2] = r'C/'
            else:
                # No link or any other link value: carbon 1 carries '=O'.
                c_chain_lst[0] = 'O'
                c_chain_lst[1] = 'C('
                c_term_lst.append(')=O')

            # Carbons available for automatic placement of modifications.
            c_idx_lst = list(range(3, num_c + 1))

            if int(fa_info_dct['NUM_DB']
                   ) > 0 and fa_info_dct['MOD_INFO'] is None:
                # Only a double-bond count was given: treat it as a single
                # 'DB' modification without site information.
                fa_info_dct['MOD'] = [{
                    'NUM': fa_info_dct['NUM_DB'],
                    'MOD': 'DB',
                    'SITE_INFO': None,
                    'SITE': None
                }]

            logger.info(fa_info_dct['MOD'])

            for _mod in fa_info_dct['MOD']:
                _mod_code = _mod['MOD']
                if _mod['SITE']:
                    # Explicit sites: write the configured fragments there.
                    site_code = mod_cfg_df.loc[_mod_code, 'SMI_SITE']
                    site_post_code = mod_cfg_df.loc[_mod_code, 'SMI_POST']
                    site_term_code = mod_cfg_df.loc[_mod_code,
                                                    'SMI_TERMINAL']
                    for _site in _mod['SITE']:
                        _idx = int(_site['SITE'])
                        if isinstance(site_code, str):
                            c_chain_lst[_idx] = site_code
                        if isinstance(site_post_code, str):
                            c_chain_lst[_idx + 1] = site_post_code
                        if isinstance(site_term_code, str):
                            c_term_lst.append(site_term_code)
                elif _mod_code in ['DB', 'Ep']:
                    # Modifications that occupy two neighbouring carbons.
                    site_code = mod_cfg_df.loc[_mod_code, 'SMI_SITE']
                    site_post_code = mod_cfg_df.loc[_mod_code, 'SMI_POST']
                    _mod_count = int(_mod['NUM'])
                    _used_idx_lst = []
                    # With no free carbon at all, start past the chain end
                    # so that the loop below is skipped.
                    c_mod_idx = c_idx_lst[0] if c_idx_lst else num_c + 1
                    _counter = 1
                    c_shift = 3
                    if c_shift * _mod_count > num_c - 2:
                        c_shift = 2  # if more C=C in chain and no bis-allylic position
                        logger.info(
                            'Too many C=C, try to remove bis-allylic positions'
                        )
                    # An index beyond the last carbon can never be free, so
                    # stop there instead of searching forever.
                    while _counter <= _mod_count and c_mod_idx <= num_c:
                        if c_mod_idx in c_idx_lst and c_mod_idx + 1 in c_idx_lst:
                            logger.info(c_mod_idx)
                            c_chain_lst[c_mod_idx] = site_code
                            c_chain_lst[c_mod_idx + 1] = site_post_code
                            _used_idx_lst.extend(
                                [c_mod_idx, c_mod_idx + 1])
                            c_idx_lst = [
                                x for x in c_idx_lst
                                if x not in _used_idx_lst
                            ]
                            c_mod_idx += c_shift
                            _counter += 1
                        else:
                            c_mod_idx += 1
                    if _counter <= _mod_count:
                        logger.warning(
                            f'Can NOT place all {_mod_code} on chain: {abbr}')
                else:
                    # Modifications that occupy a single carbon.
                    site_code = mod_cfg_df.loc[_mod_code, 'SMI_SITE']
                    site_term_code = mod_cfg_df.loc[_mod_code,
                                                    'SMI_TERMINAL']
                    _mod_count = int(_mod['NUM'])
                    c_mod_idx = c_idx_lst[0] if c_idx_lst else num_c + 1
                    _counter = 1
                    # Same bound as above to guarantee termination.
                    while _counter <= _mod_count and c_mod_idx <= num_c:
                        if c_mod_idx in c_idx_lst:
                            c_chain_lst[c_mod_idx] = site_code
                            if isinstance(site_term_code, str):
                                c_term_lst.append(site_term_code)
                            c_idx_lst.remove(c_mod_idx)
                            _counter += 1
                        else:
                            c_mod_idx += 1
                    if _counter <= _mod_count:
                        logger.warning(
                            f'Can NOT place all {_mod_code} on chain: {abbr}')

        else:
            c_chain_lst = []
            c_term_lst = []

        if c_term_lst:
            c_chain_lst.extend(sorted(c_term_lst, reverse=True))

        if c_chain_lst:
            smi = ''.join(c_chain_lst)
            # Collapse adjacent bond-direction marks produced by
            # neighbouring fragments: '\/' becomes '\' and '/\' becomes '/'.
            smi = re.sub(r'\\/', r'\\', smi)
            smi = re.sub(r'/\\', r'/', smi)

        return smi
Ejemplo n.º 11
0
    fa_lst = [
        'FA18:0',
        '18:1',
        'O-16:0',
        'P-18:0',
        '20:4[4DB,2OH,1Ke]',
        '20:4[4DB{5,9,12,15},2OH{8,11},1Ke{14}]',
        '20:4[4DB{5Z,9E,12E,15E},2OH{8S,11R},1Ke{14}]',
        '20:4[4DB{5Z,9E,11Z,14Z},1OH{8S}]',
        '9:0<CHO{@9C}>',
        # '20:1[PGA{8a,12b},1DB{13Z},1OH{15S}]'
    ]

    for _abbr in fa_lst:
        fa = fa_decoder.decode_fa(_abbr)
        logger.info(fa)
        _smi = fa_decoder.get_smi_fa(_abbr)
        logger.info(_abbr + ': ' + _smi)

    pl_lst = [
        r'PC(O-16:0/18:1)',
        r'PC(P-16:0_18:1)',
        r'PC(P-16:0/18:1)',
        'PC(16:0/20:4[4DB,2OH,1Ke])',
        'PC(16:0/20:4[4DB{5,9,12,15},2OH{8,11},1Ke{14}])',
        'PC(16:0/20:4[4DB{5Z,9E,12E,15E},2OH{8S,11R},1Ke{14}])',
    ]

    for _abbr in pl_lst:
        logger.info(_abbr)
        pl = pl_decoder.decode_pl(_abbr)