Beispiel #1
0
 def get_build_num(self, dir=None, is_ret_int=True):
     """Return the ``build`` attribute of the ``VersionManager`` built from ``dir``.

     Args:
         dir: value handed to ``VersionManager``; must not be None.
         is_ret_int: when true the build is converted with ``int()``,
             otherwise it is returned exactly as ``VersionManager`` exposes it.

     Raises:
         ValueError: if ``dir`` is None.
     """
     if dir is None:
         raise ValueError('dir is empty.')
     build = VersionManager(dir).build
     return int(build) if is_ret_int else build
Beispiel #2
0
    def merge(self):
        """Merge the feature-vector files that share a position across ``self.fv``.

        Each element of ``self.fv`` is a sequence of file descriptors.  Files
        found at the same index are grouped together (one group per tissue,
        according to the original comments), loaded with
        ``CSVLoader.csv2dict`` and merged on the gene IDs present in every
        file of the group.  A merged row has the form
        ``[gnid, <features of file 1>, <features of file 2>, ..., label]``:
        the last column of every source row is read as a 0/1 class label and
        the merged label is '1' when at least two sources are positive.

        One ``FvFile`` per group is appended to ``self.fv_merged``; nothing is
        returned.
        """
        # Regroup the files by their index inside each set: fv_tissue[i] holds
        # the i-th file of every set.
        fv_tissue = list()
        for fv_set in self.fv:
            for idx, file in enumerate(fv_set):
                if len(fv_tissue) <= idx:
                    fv_tissue.append([file])
                else:
                    fv_tissue[idx].append(file)

        for files in fv_tissue:
            tissue = None
            label_name = ''

            # step 1: load every file of the group as a dict keyed by gnid
            fv_dicts = list()
            for file in files:
                fv_dicts.append(
                    CSVLoader.csv2dict(filename=file.path + file.file_name))
                if tissue is None:
                    # the first file of the group decides the tissue number
                    tissue = int(file.tissue)
                build_num = VersionManager(file.version).build
                label_name += '-' + file.label_type + file.label_name + '_' + str(build_num)

            # step 2.1: gnids shared by all files.  A None sentinel marks
            # "nothing seen yet"; testing for an empty list instead would
            # re-seed the intersection from a later file as soon as it became
            # empty, and the lookups below would then raise KeyError.
            common_gnids = None
            for fv_dict in fv_dicts:
                gnids = list(fv_dict)
                if common_gnids is None:
                    common_gnids = gnids
                else:
                    common_gnids = ListTool.common_items(common_gnids, gnids)
            if common_gnids is None:
                common_gnids = list()

            # step 2.2: concatenate the feature columns and vote on the label
            merged = list()
            for gnid in common_gnids:
                fv_m = list()
                positives = 0
                for fv_dict in fv_dicts:
                    row = fv_dict[gnid]
                    fv_m += row[:-1]
                    positives += int(row[-1])
                # positive only when at least two sources agree (an earlier
                # variant of this code used a threshold of 1)
                merged.append([str(gnid)] + fv_m + ['1' if positives >= 2 else '0'])

            # show total number of gnids and size of the positive class
            print('total gnids: {}\tpositive class#: {}'.format(len(merged), sum([int(x[-1]) for x in merged])))

            # wrap the merged table in an FvFile
            file_name = 'maize_gp_mt%02d%s_%s.arff' % (tissue, label_name, settings.DEV_VERSION)
            path = settings.RESULT_DIR + settings.DEV_VERSION + '/0/1/'
            self.fv_merged.append(FvFile(file_name=file_name, path=path, data=merged))
            print('\t\t===> ' + path + file_name)
Beispiel #3
0
def main(argv):
    """Collect the ARFF files of every feature directory under a source tree.

    ``argv`` is the full argument vector (program name first).  For each
    directory reported by ``FvManager.get_feature_dirs(src)`` the ARFF files
    found in its ``0/1/`` sub-directory are copied, under their own names,
    into the destination directory (``args.dst``, or ``<src>/../0_arffs/``
    when no destination is given).

    Raises:
        FileNotFoundError: if the source directory does not exist.
    """
    # Show the help text before parsing anything so that a bare invocation
    # never ends in a "missing argument" error instead.
    if len(argv) <= 1:
        parser.parse_args(['--help'])
        return

    args = parser.parse_args(argv[1:])

    # get source dir; without it there is nothing to scan, so stop with a
    # usage error instead of silently listing the current directory.
    if not args.src:
        parser.error('a source directory is required.')
    src = args.src
    if not os.path.exists(src):
        raise FileNotFoundError('{} does not exist.'.format(src))

    # get destination dir (the trailing '' keeps the trailing separator)
    dst = args.dst if args.dst else os.path.join(src, '..', '0_arffs', '')

    print('version:', settings.DEV_VERSION)
    print("source directory:", src)
    print("destination directory:", dst)

    #
    # Rename and aggregating to the destination dir
    #
    dst_created = False
    for feature_dir in sorted(FvManager.get_feature_dirs(src)):
        # os.path.join works whether or not src ends with a separator
        src_arffs_dir = os.path.join(src, feature_dir, '0', '1', '')
        files_list = FileList.ls(path=src_arffs_dir,
                                 recursion=False,
                                 mode=settings.FLS_FILE_ONLY,
                                 ext=settings.FLS_EXT_ARFF)
        for entry in files_list:
            if entry.get_ext().lower() != settings.FLS_EXT_ARFF:
                continue
            # create the destination lazily, only once there is a file to copy
            if not dst_created:
                FileManager.dir_create(dst)
                dst_created = True
            new_file = os.path.join(dst, entry.name)
            copyfile(src=entry.full, dst=new_file)
            print(new_file)
Beispiel #4
0
    def create_arff(self):
        """Write one ARFF file per entry of ``self.features``.

        Every file is named ``maize_gp_<build>_K<kmer size>_<feature name>``
        and placed under
        ``<RESULT_DIR><DEV_VERSION>/<debug_mode[1]>/<debug_mode[0]>/``.  The
        header declares one numeric attribute per feature name plus a
        ``class`` attribute.  The data section holds one line per gene ID of
        the feature's dataset that has an entry in ``self.feature_vector``:
        the vector values followed by the assigned class, or '?' when the
        feature has no prediction result for that gene.
        """
        version = VersionManager(version_str=settings.DEV_VERSION)
        feature_names = [feature.name.strip() for feature in self.features]

        # write arff file for each feature
        for feature in self.features:
            file_name = 'maize_gp_{BD}_K{KM}_{FN}'.format(
                BD=version.build, KM=self.kmer_size, FN=feature.name.strip())
            file_path = '{RD}{VER}/{IT}/{RS}/{FI}.arff'.format(
                RD=settings.RESULT_DIR,
                VER=settings.DEV_VERSION,
                IT=self.debug_mode[1],
                RS=self.debug_mode[0],
                FI=file_name)
            f = FileManager.file_open(file_path, 'w')
            # try/finally so the handle is released even if a lookup or a
            # write fails half-way through the file
            try:
                # arff header
                f.write('@relation %s\n' % file_name)
                for feature_attr in feature_names:
                    f.write('@attribute %s numeric\n' % feature_attr)
                f.write('@attribute class {1,0}\n')
                f.write('@data\n')

                # arff data: one line per gene ID that has a feature vector
                gnids = self.exp_setting.get_gene_dataset_gnids_list(
                    feature_id=feature.corresp_tissue)
                for gnid in gnids:
                    vector = self.feature_vector.get(gnid, None)
                    if vector is None:
                        continue
                    line = ",".join(str(value) for value in vector)
                    predicted_results = feature.prediction_results.get(gnid)
                    if predicted_results is None:
                        data_label = '?'  # ARFF marker for a missing value
                    else:
                        data_label = predicted_results.get_assigned_class()
                    f.write("%s,%s\n" % (line, data_label))
            finally:
                f.close()
def main(argv):
    """Aggregate the 'overall' rows of all prediction summaries into one CSV.

    ``argv`` is the full argument vector (program name first).  For every
    directory reported by ``FvManager.get_feature_dirs(src)`` whose build is
    selected (all builds when none are given or 'all' is passed), each file in
    its ``0/`` sub-directory is read with ``CSVLoader.csv2list``.  Rows whose
    fourth column is 'overall' are kept, with the first column replaced by the
    build and the columns seq_type, label_type, label_name and tissue number
    inserted.  The result is written to ``<src>/../1_summary/``.

    Raises:
        FileNotFoundError: if the source directory does not exist.
    """
    # Show the help text before parsing anything so that a bare invocation
    # never ends in a "missing argument" error instead.
    if len(argv) <= 1:
        parser.parse_args(['--help'])
        return

    args = parser.parse_args(argv[1:])

    # get seq type
    seq_type = None
    if args.seq_type:
        seq_type_str = args.seq_type
        seq_key = seq_type_str.lower()
        if seq_key in ['p', 'protein', 'peptide']:
            seq_type = 'Protein'
        elif seq_key in ['d', 'dna']:
            seq_type = 'DNA'
        elif seq_key in ['m', 'promoter', 'pmt', 'pm']:
            seq_type = 'Promoter'
        elif seq_key in ['r', 'rda', 'ra']:
            seq_type = 'Reduced Alphabet'
        else:
            seq_type = 'Unknown - ' + seq_type_str
        print('seq type: {}'.format(seq_type))

    # get the builds to combine; None means "all builds"
    builds_list = None
    if args.builds:
        arg_str = args.builds[0]
        print(arg_str)
        if arg_str.lower() != 'all':
            builds_list = StrTool.ranges2list(arg_str)
    else:
        print('Builds: {}'.format(builds_list))

    # get source directory; without it there is nothing to scan, so stop with
    # a usage error instead of silently listing the current directory.
    if not args.src:
        parser.error('a source directory is required.')
    source_dir_path = args.src
    # NOTE(review): fixed-offset slice of the path, presumably the experiment
    # number embedded in the directory name -- confirm the expected layout.
    main_exp_num = source_dir_path[-20:-11]
    if not os.path.exists(source_dir_path):
        raise FileNotFoundError(
            '{} does not exist.'.format(source_dir_path))
    print('source directory:', source_dir_path)

    #
    # build aggregated results summary
    #
    is_reduced_alphabet = seq_type == "Reduced Alphabet"
    aggregated_summary = None
    for feature_dir in sorted(FvManager.get_feature_dirs(source_dir_path)):
        version_info = VersionManager(version_str=feature_dir)
        # NOTE(review): this membership test only matches if
        # StrTool.ranges2list yields the same type as version_info.build.
        if builds_list is not None and version_info.build not in builds_list:
            continue
        # os.path.join works whether or not the path ends with a separator
        prediction_summary_dir = os.path.join(source_dir_path, feature_dir,
                                              '0', '')
        # get prediction summary files
        files_list = FileList.ls(path=prediction_summary_dir,
                                 recursion=False,
                                 mode=settings.FLS_FILE_ONLY)
        for summary_file in files_list:
            print(summary_file.full)
            summary_list = CSVLoader.csv2list(file_name=summary_file.full)

            # add overall rows (the first row is the header)
            for row in summary_list[1:]:
                if row[3] != 'overall':
                    continue
                # row[2] is handed to the File helpers to recover the
                # tissue number, label type and label name
                tissue = File.tissue_info(row[2])
                label_type = File.label_type(row[2])
                label_name = File.label_name(row[2])
                exp_num = version_info.build

                # the header of the first summary that contributes a row
                # becomes the header of the aggregated table
                if aggregated_summary is None:
                    header = summary_list[0]
                    if is_reduced_alphabet:
                        # first column becomes two columns, extras go in at 3
                        header[0:1] = ['Exp_Num', 'Mapping ID']
                        header[3:3] = [
                            'seq_type', 'label_type', 'label_name',
                            'tissue_num'
                        ]
                    else:
                        header[0] = 'Exp_Num'
                        header[2:2] = [
                            'seq_type', 'label_type', 'label_name',
                            'tissue_num'
                        ]
                    aggregated_summary = [header]

                # same column layout as the header; for reduced-alphabet
                # runs the original first column is kept as the mapping ID
                if is_reduced_alphabet:
                    row[0:1] = [exp_num, row[0]]
                    row[3:3] = [seq_type, label_type, label_name, tissue]
                else:
                    row[0] = exp_num
                    row[2:2] = [seq_type, label_type, label_name, tissue]
                aggregated_summary.append(row)

    # write aggregated summary
    # NOTE(review): aggregated_summary is still None when no 'overall' row was
    # found; confirm CSVLoader.list2csv copes with that.
    summary_dir = os.path.join(source_dir_path, '..', '1_summary')
    if is_reduced_alphabet:
        file_name = os.path.join(
            summary_dir,
            'prediction_summary_aggregated_RA_' + settings.DEV_VERSION +
            '.csv')
    else:
        file_name = os.path.join(
            summary_dir,
            'prediction_summary_aggregated_' + main_exp_num + '_' +
            settings.DEV_VERSION + '.csv')
    CSVLoader.list2csv(list_data=aggregated_summary, filename=file_name)
    print('{} has been created.'.format(file_name))
Beispiel #6
0
 def set_version(self, version):
     """Wrap ``version`` in a ``VersionManager`` and store it as ``self.version``."""
     parsed_version = VersionManager(version)
     self.version = parsed_version
Beispiel #7
0
 def get_feature_dirs(loc):
     """Return the names of the feature directories found directly under ``loc``.

     An entry qualifies when it is a directory and its name is accepted by
     ``VersionManager.validate_pattern``.  Plain files with a matching name are
     skipped, because callers append sub-paths to every returned name.  Names
     come back in the order ``os.listdir`` reports them, so callers sort them
     themselves.

     Args:
         loc: path of the directory to scan.

     Returns:
         A list of entry names (not full paths).
     """
     return [
         name for name in os.listdir(loc)
         if os.path.isdir(os.path.join(loc, name))
         and VersionManager.validate_pattern(name)
     ]