Example #1
import glob
import os

import numpy as np
import pandas as pd

import features  # project-local module providing FeatureConstructor


def collect_data(file_path):
    os.chdir(file_path)
    extension = 'csv'
    # One classification-results file per split, e.g. split_0_classification_results.csv
    all_files = glob.glob('split_*_classification_results.{}'.format(extension))
    print(all_files)
    attributes_calculator = features.FeatureConstructor()
    ordered_attributes_list = list(attributes_calculator.attributes_map.keys())
    # Column layout: index, one column per attribute, then one column per
    # classifier/metric pair (RF, SVM and KNN each report Precision, Recall, F1 and AUC)
    col = ["index"]
    col.extend(ordered_attributes_list)
    metric_columns = np.array(
        [[c + m for m in ["Precision", "Recall", "F1", "AUC"]]
         for c in ["RF_", "SVM_", "KNN_"]]).flatten()
    col.extend(metric_columns)
    print(ordered_attributes_list)
    # Concatenate every split's results; skiprows=1 drops each file's own header row
    df = pd.concat([
        pd.read_csv(f, sep=",", index_col=0, skiprows=1, names=col)
        for f in all_files
    ])
    df = df.reset_index()
    df = df[col[1:]]
    #print(df)
    #df["sum_of_neighbors"].hist(bins=100)
    return df
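The least obvious step in this loader is how the per-classifier metric columns are built: a nested comprehension crosses each classifier prefix with each metric name, and the result is flattened into a single list of column names. A standalone sketch of just that idiom, with the prefixes and metric names copied from the function above:

import numpy as np

classifiers = ["RF_", "SVM_", "KNN_"]
metrics = ["Precision", "Recall", "F1", "AUC"]
# Cross every classifier prefix with every metric name, then flatten to one list of names
metric_columns = np.array([[c + m for m in metrics] for c in classifiers]).flatten()
print(list(metric_columns))
# ['RF_Precision', 'RF_Recall', 'RF_F1', 'RF_AUC', 'SVM_Precision', ..., 'KNN_AUC']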
Example #2
    def calculate_classification_dataset(self):
        """
		Calculates the attributes for each example of the sample, and returns it as a matrix ready for applying the classification
		algorithms, in order to perform the link prediction.
		"""
        attributes_calculator = features.FeatureConstructor(
            self.graph_training)
        if self.attributes_list == {}:
            self.ordered_attributes_list = sorted(
                attributes_calculator.attributes_map.keys())
            for attribute in self.ordered_attributes_list:
                self.attributes_list[attribute] = {}

        classification_dataset = np.zeros(
            (self.sample_size, len(self.attributes_list) + 2))
        line_count = 0

        for line in self.sample_dataset:
            first_node, second_node, pair_class, pair_fold = line
            attributes_calculator.set_nodes(first_node, second_node)
            column = 0
            for function in self.ordered_attributes_list:
                parameters = self.attributes_list[function]
                classification_dataset[line_count][
                    column] = attributes_calculator.attributes_map[function](
                        **parameters)
                column += 1
            classification_dataset[line_count][-2] = pair_class  # label column
            classification_dataset[line_count][-1] = pair_fold  # cross-validation fold column

            line_count += 1

        return classification_dataset
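The resulting matrix has one row per sampled pair: the attribute columns come first, followed by the class and fold in the last two columns. A minimal self-contained sketch of that layout; the two stand-in feature functions, the neighbour lists and the labels below are invented for illustration and are not part of the original FeatureConstructor:

import numpy as np

# Stand-in for attributes_calculator.attributes_map: feature name -> callable
attributes_map = {
    "common_neighbors": lambda a, b: len(set(a) & set(b)),
    "jaccard": lambda a, b: len(set(a) & set(b)) / max(len(set(a) | set(b)), 1),
}
ordered_attributes_list = sorted(attributes_map.keys())

# Each made-up sample row: (neighbours of node 1, neighbours of node 2, class, fold)
sample = [([1, 2, 3], [2, 3, 4], 1, 0),
          ([1], [5, 6], 0, 1)]

dataset = np.zeros((len(sample), len(ordered_attributes_list) + 2))
for row, (n1, n2, pair_class, pair_fold) in enumerate(sample):
    for col, name in enumerate(ordered_attributes_list):
        dataset[row][col] = attributes_map[name](n1, n2)
    dataset[row][-2] = pair_class  # label column
    dataset[row][-1] = pair_fold   # cross-validation fold column
print(dataset)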
Example #3
    def get_features_inner(self, inp):
        s, p_edges = inp  # split index and candidate (negative) edge list
        out_dir = f'results/{args.f}'
        out_file = f'{out_dir}/split_{s}_features.csv'
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        # Truncate any previous output for this split
        open(out_file, 'w+').close()

        attributes_calculator = features.FeatureConstructor(self.train_graph, self.page_rank)
        ordered_attributes_list = list(attributes_calculator.attributes_map.keys())

        # Positive examples: every test edge, written with class label 1
        for pair in self.test_edges:
            n1, n2 = pair
            attributes_calculator.set_nodes(n1, n2)
            column_values = np.zeros(len(ordered_attributes_list) + 1)
            column_values[:-1] = attributes_calculator.get_features(pair)
            column_values[-1] = 1  # class label
            with open(out_file, 'a+') as file:
                np.savetxt(file, [column_values], delimiter=",", fmt='%f')

        # Negative examples: candidate pairs that are neither test nor training edges,
        # written with class label 0; stop once the classes are balanced.
        c = 0
        for pair in p_edges:
            if pair in self.test_edges or pair in self.train_edges:
                continue
            n1, n2 = pair
            attributes_calculator.set_nodes(n1, n2)
            column_values = np.zeros(len(ordered_attributes_list) + 1)
            column_values[:-1] = attributes_calculator.get_features(pair)
            column_values[-1] = 0  # class label
            with open(out_file, 'a+') as file:
                np.savetxt(file, [column_values], delimiter=",", fmt='%f')
            c += 1
            if c >= len(self.test_edges):
                break
        return 1
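Each row is written by re-opening the split file in append mode and passing np.savetxt a one-row 2-D array, with the class label in the last column. A standalone sketch of that write path; the file name and the values are placeholders:

import numpy as np

# One row per node pair: feature values followed by the class label in the last column
row = np.array([0.25, 3.0, 0.8, 1.0])
with open("split_0_features.csv", "a+") as fh:
    # np.savetxt expects a 2-D array, hence the wrapping in a one-element list
    np.savetxt(fh, [row], delimiter=",", fmt="%f")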
Example #4
def collect_data(file_path):
    os.chdir(file_path)
    extension = 'csv'
    # One features file per split, e.g. split_0_features.csv
    all_files = glob.glob('*_features.{}'.format(extension))
    print(all_files)
    attributes_calculator = features.FeatureConstructor()
    # Column layout: one column per attribute, plus the class label in the last column
    ordered_attributes_list = list(attributes_calculator.attributes_map.keys())
    ordered_attributes_list.append("class")
    print(ordered_attributes_list)
    # Concatenate every split's feature file into a single DataFrame
    df = pd.concat([
        pd.read_csv(f, sep=",", names=ordered_attributes_list)
        for f in all_files
    ])
    #print(df)
    #df["sum_of_neighbors"].hist(bins=100)
    return df
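Because this loader appends a "class" column after the attribute columns, the usual next step is separating features from labels. A small sketch under that assumption; the frame below stands in for the return value of collect_data and its feature names are placeholders:

import pandas as pd

# Stand-in for the DataFrame returned by collect_data(); the feature names are made up
df = pd.DataFrame({"common_neighbors": [3, 0],
                   "jaccard": [0.5, 0.0],
                   "class": [1, 0]})
X = df.drop(columns=["class"]).values  # feature matrix for the classifiers
y = df["class"].values                 # link/no-link labels
print(X.shape, y)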
Example #5
	def set_classification_dataset(self):
		"""
		Calculates the attributes for each example of the sample, and returns it as a matrix ready for applying the classification
		algorithms, in order to perform the link prediction.
		"""
		self.classification_dataset = np.zeros((self.sample_size, len(self.attributes_list) + 2))
		line = 0
		attributes_calculator = features.FeatureConstructor(self.graph_training)
		for edge in self.positive_examples.union(self.negative_examples):
			first_node, second_node = edge
			attributes_calculator.set_nodes(first_node, second_node)
			pair_class = 0 if edge in self.negative_examples else 1
			column = 0
			for function in self.ordered_attributes_list:
				parameters = self.attributes_list[function]
				self.classification_dataset[line][column] = attributes_calculator.attributes_map[function](**parameters)
				column += 1
			self.classification_dataset[line][-2] = pair_class  # label column (the remaining last column is reserved for the fold)
			line += 1
		
		self.normalize_attributes()
		self.set_dataset_folds()
		return self.classification_dataset
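Here the class of each pair comes purely from set membership over the union of the positive and negative example sets. A tiny sketch of just that rule, with made-up node pairs:

# The edges below are invented node pairs, used only to show the labelling rule
positive_examples = {(1, 2), (2, 3)}
negative_examples = {(1, 4)}
for edge in positive_examples.union(negative_examples):
    pair_class = 0 if edge in negative_examples else 1
    print(edge, pair_class)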
Example #6
    def expand_data(new_species_xlsx, output_hdf5, species_df_key, rxn_df_key,
                    elements_csv, bonds_csv, new_xlsx_path):
        """
        Injects new data into the species dataframe as more CIDs are fetched manually.
        :param new_species_xlsx: New species xlsx file which stores the newly fetched CIDs
        :param output_hdf5: Output HDF5 file that houses the species df
        :param species_df_key: Species df key in output_hdf5
        :param rxn_df_key: Reactions df key in output_hdf5
        :param elements_csv: Elements CSV used to initialize the FeatureConstructor
        :param bonds_csv: Bonds CSV used to initialize the FeatureConstructor
        :param new_xlsx_path: Path of the new reactions xlsx written out for ML training
        :return: New data for ML experiments
        """

        # Reading the xlsx file which contains newly fetched PubChem IDs into a pandas df
        new_df_from_xlsx = pd.read_excel(new_species_xlsx, header=0)

        # Reading the old species dataframe to which the new PubChem IDs have to be transferred
        old_df_from_hdf = pd.read_hdf(output_hdf5, species_df_key)

        # Setting 'Species' name as index for efficiency
        old_df_from_hdf = old_df_from_hdf.reset_index()
        old_df_from_hdf = old_df_from_hdf.set_index(keys="Species",
                                                    verify_integrity=True)

        # Initializing FeatureConstructor
        my_constructor = ft.FeatureConstructor(elements_csv, bonds_csv)

        # Transferring CID, adding BondsInfo (stringified PubChem JSON) and the species feature vector
        new_species_count = 0
        for idx, row in new_df_from_xlsx.iterrows():
            # Skip xlsx rows that still have no CID
            if math.isnan(row['CID']) or row['CID'] == "":
                continue
            existing_cid = old_df_from_hdf.at[row['Species'], 'CID']
            # Only fill in species whose CID is still missing in the old dataframe
            if math.isnan(existing_cid) or existing_cid == "":
                old_df_from_hdf.at[row['Species'], 'CID'] = row['CID']
                pubchem_str_json = my_constructor.get_full(row['CID'])
                print("--Data fetched for CID {}--".format(int(row['CID'])))
                old_df_from_hdf.at[row['Species'], 'BondsInfo'] = pubchem_str_json
                old_df_from_hdf.at[row['Species'], 'FeatureVector'] = (
                    my_constructor.bonds_count_json(None, pubchem_str_json))
                new_species_count += 1

        print('--Status--')
        print('--{} New Species Added--'.format(new_species_count))

        if new_species_count == 0:
            print('No new changes were made as there were no new species to add.')
            return
        else:

            # Updating HDF with updated species df
            old_df_from_hdf = old_df_from_hdf.reset_index()
            old_df_from_hdf = old_df_from_hdf.set_index(keys="SID",
                                                        verify_integrity=True)
            old_df_from_hdf.to_hdf(output_hdf5, species_df_key)

            # Updating Reactions DF with new CID list
            rm.RecordMapper.map_rid_to_cid(output_hdf5, rxn_df_key,
                                           species_df_key)

            # Filtering the reactions whose feature vectors can be calculated
            reduced_rxn_df = Extender.get_rxn_subset(output_hdf5, rxn_df_key)

            # Creating feature vectors for the filtered reactions
            reduced_rxn_df = my_constructor.bond_brk(output_hdf5,
                                                     species_df_key,
                                                     reduced_rxn_df)

            print('--Status--')
            print('--Reactions Feature Vectors Created--')

            # Creating the new reactions xlsx for ML Training
            reduced_rxn_df.to_excel(new_xlsx_path)

            print('--Status--')
            print('--Database Expansion Routine Complete--')
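The guard in the transfer loop treats a CID cell as unset when it holds NaN or an empty string. A minimal sketch of that check pulled out as a helper; the helper name is an assumption, and unlike the inline test it also tolerates string cells without raising:

import math

def cid_is_missing(value):
    # A CID cell counts as unset when it holds NaN or an empty string
    if isinstance(value, str):
        return value == ""
    return math.isnan(value)

print(cid_is_missing(float("nan")), cid_is_missing(""), cid_is_missing(962.0))  # True True False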