Esempio n. 1
0
    def featurize(self):
        """Run the featurizer job and load its output into ``self.features_frame``.

        Steps, in order:

        1. Recreate the ``featurizer`` folder under ``self.output_dir``, create
           a ``catalog`` folder and copy ``catalog/catalog.csv`` from
           ``self.input_dir`` into it by way of ``/storage``.
        2. Mark the ``SMJob`` row for ``self.uid`` as Featurizer / Running /
           Initializing.
        3. Write the ``id``, ``l_id`` and ``r_id`` columns of
           ``self.block_frame`` followed by ``self.seed_examples`` to
           ``featurizer-input.csv`` and upload it.
        4. Create the job on the mapper service, follow its status stream and,
           once the first reported condition is ``Complete``, read
           ``features.csv`` (sorted by ``id``) and ask the service to delete
           the job.

        Returns:
            ``True`` once the features are loaded. If the status stream ends
            without reporting completion the method falls through and
            implicitly returns ``None``.

        Raises:
            Exception: the last error raised by ``client.file_url`` when the
                features file is still unavailable after the retries.
        """
        client = py_cdrive_api.Client(access_token=self.access_token)
        featurizer_dir = self.output_dir + '/featurizer'
        try:
            client.delete(featurizer_dir)
        except py_cdrive_api.ForbiddenAccessException:
            # Deliberately ignored; presumably raised when there is no previous
            # folder to delete -- TODO confirm against py_cdrive_api.
            pass
        client.create_folder(self.output_dir, 'featurizer')
        client.create_folder(self.output_dir, 'catalog')

        client.download_file(self.input_dir + '/catalog/catalog.csv', '/storage')
        client.upload('/storage/catalog.csv', self.output_dir + '/catalog/catalog.csv')

        sm_job = SMJob.objects.filter(uid=self.uid)[0]
        sm_job.stage = "Featurizer"
        sm_job.status = "Running"
        sm_job.long_status = "Initializing"
        sm_job.save()

        featurizer_input = pd.DataFrame()
        featurizer_input[['id', 'l_id', 'r_id']] = self.block_frame[['id', 'l_id', 'r_id']]
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent call and also works on older pandas versions.
        featurizer_input = pd.concat(
            [featurizer_input, self.seed_examples], ignore_index=True, sort=False)
        featurizer_input = featurizer_input.astype(int)
        input_csv = settings.DATA_PATH + '/' + self.uid + '/featurizer-input.csv'
        featurizer_input.to_csv(input_csv, index=False)
        client.upload(input_csv, featurizer_dir)

        data = {
            'inputFolderPath': self.output_dir,
            'outputPath': featurizer_dir,
            'outputName': 'features.csv',
            'imageUrl': self.featurizer_url,
            'workers': self.featurizer_replicas
        }
        featurizer_base_url = 'http://mapper-' + os.environ['COLUMBUS_USERNAME'] + '/'
        response = requests.post(url=featurizer_base_url + 'create', data=data, headers={'Authorization': self.auth_header})
        featurizer_id = response.json()['uid']
        response = requests.post(url=featurizer_base_url + 'status?uid=' + featurizer_id, stream=True)

        for line in response.iter_lines():
            if not line:
                # Blank lines in the stream carry no status payload.
                continue
            status_json = json.loads(line.decode('utf-8'))
            if "conditions" in status_json and status_json["conditions"][0]["type"] == "Complete":
                # The output file may not be visible immediately after the job
                # reports completion: retry, sleeping 2 seconds between tries,
                # and give up on the 11th consecutive failure.
                attempts = 0
                while True:
                    try:
                        features_url = client.file_url(featurizer_dir + '/features.csv')
                        break
                    except Exception:
                        attempts += 1
                        if attempts > 10:
                            raise
                        time.sleep(2)
                self.features_frame = pd.read_csv(features_url).sort_values('id').reset_index(drop=True)
                requests.post(url=featurizer_base_url + 'delete', data={'uid': featurizer_id}, headers={'Authorization': self.auth_header})
                return True
            # Not complete yet: record the "Executing" state once.
            if sm_job.long_status != 'Executing':
                sm_job.status = 'Running'
                sm_job.long_status = 'Executing'
                sm_job.save()
Esempio n. 2
0
 def block(self):
     """Run the blocker job and load its candidate pairs into ``self.block_frame``.

     Recreates the ``blocker`` folder under ``self.output_dir``, marks the
     ``SMJob`` row for ``self.uid`` as Blocking / Running / Initializing,
     creates the job on the mapper service and follows its status stream.
     When the first reported condition is ``Complete`` the method reads
     ``candidates.csv``, deletes the remote copy, drops the ``Unnamed: 0``
     column, inserts a 1-based ``id`` column at position 0, uploads the
     rewritten file and asks the service to delete the job.

     Returns:
         ``True`` once the candidates are loaded. If the status stream ends
         without reporting completion the method falls through and
         implicitly returns ``None``.

     Raises:
         Exception: the last error raised by ``client.file_url`` when the
             candidates file is still unavailable after the retries.
         KeyError: if the candidates file has no ``Unnamed: 0`` column.
     """
     client = py_cdrive_api.Client(access_token=self.access_token)
     try:
         client.delete(self.output_dir + '/blocker')
     except py_cdrive_api.ForbiddenAccessException:
         # Deliberately ignored; presumably raised when there is no previous
         # folder to delete -- TODO confirm against py_cdrive_api.
         pass
     client.create_folder(self.output_dir, 'blocker')
     sm_job = SMJob.objects.filter(uid=self.uid)[0]
     sm_job.stage = "Blocking"
     sm_job.status = "Running"
     sm_job.long_status = "Initializing"
     sm_job.save()
     data = {
         'inputFolderPath': self.output_dir + '/profiler',
         'outputPath': self.output_dir + '/blocker',
         'outputName': 'candidates.csv',
         'imageUrl': self.blocker_url,
         'workers': self.blocker_replicas
     }
     blocker_base_url = 'http://mapper-' + os.environ['COLUMBUS_USERNAME'] + '/'
     response = requests.post(url=blocker_base_url + 'create', data=data, headers={'Authorization': self.auth_header})
     blocker_id = response.json()['uid']
     response = requests.post(url=blocker_base_url + 'status?uid=' + blocker_id, stream=True)
     for line in response.iter_lines():
         if not line:
             # Blank lines in the stream carry no status payload.
             continue
         status_json = json.loads(line.decode('utf-8'))
         if "conditions" in status_json and status_json["conditions"][0]["type"] == "Complete":
             candidates_path = '{}/blocker'.format(self.output_dir)
             candidates_file = '{}/candidates.csv'.format(candidates_path)
             # The output file may not be visible immediately after the job
             # reports completion: retry, sleeping 2 seconds between tries,
             # and give up on the 11th consecutive failure.
             attempts = 0
             while True:
                 try:
                     block_url = client.file_url(candidates_file)
                     break
                 except Exception:
                     attempts += 1
                     if attempts > 10:
                         raise
                     time.sleep(2)
             self.block_frame = pd.read_csv(block_url)
             # The remote file is replaced below by the re-indexed version.
             client.delete(candidates_file)
             del self.block_frame['Unnamed: 0']
             self.block_frame.insert(0, 'id', range(1, 1 + len(self.block_frame)))
             self.block_frame.to_csv('/storage/candidates.csv', index=False)
             client.upload('/storage/candidates.csv', candidates_path)
             requests.post(url=blocker_base_url + 'delete', data={'uid': blocker_id}, headers={'Authorization': self.auth_header})
             return True
         # Not complete yet: record the "Executing" state once.
         if sm_job.long_status != 'Executing':
             sm_job.status = 'Running'
             sm_job.long_status = 'Executing'
             sm_job.save()
Esempio n. 3
0
    def profile(self):
        """Run the profiler job and load its traits into ``self.profile_frame``.

        Recreates the ``profiler`` folder under ``self.output_dir``, creates
        the job on the mapper service and follows its status stream. When the
        first reported condition is ``Complete`` the method reads
        ``traits.csv``, deletes the remote copy, replaces ``column_id`` with a
        1-based row number, uploads the rewritten file, renames ``column_id``
        to ``id`` in the in-memory frame and asks the service to delete the
        job.

        Returns:
            ``True`` once the traits are loaded; implicitly ``None`` if the
            status stream ends without reporting completion.
        """
        client = py_cdrive_api.Client(access_token=self.access_token)
        profiler_dir = self.output_dir + '/profiler'
        try:
            client.delete(profiler_dir)
        except py_cdrive_api.ForbiddenAccessException:
            # Ignored on purpose; presumably there was nothing to delete
            # -- TODO confirm against py_cdrive_api.
            pass
        client.create_folder(self.output_dir, 'profiler')

        job_spec = {
            'inputFolderPath': self.input_dir,
            'outputPath': profiler_dir,
            'outputName': 'traits.csv',
            'imageUrl': self.profiler_url,
            'workers': self.profiler_replicas
        }
        auth = {'Authorization': self.auth_header}
        profiler_base_url = 'http://mapper-' + os.environ['COLUMBUS_USERNAME'] + '/'
        created = requests.post(url=profiler_base_url + 'create', data=job_spec, headers=auth)
        profiler_id = created.json()['uid']
        sm_job = SMJob.objects.filter(uid=self.uid)[0]

        status_stream = requests.post(url=profiler_base_url + 'status?uid=' + profiler_id, stream=True)
        for raw_line in status_stream.iter_lines():
            if not raw_line:
                continue
            status_json = json.loads(raw_line.decode('utf-8'))
            finished = "conditions" in status_json and status_json["conditions"][0]["type"] == "Complete"
            if not finished:
                # Record the "Executing" state only once.
                if sm_job.long_status != 'Executing':
                    sm_job.status = 'Running'
                    sm_job.long_status = 'Executing'
                    sm_job.save()
                continue

            traits_file = profiler_dir + '/traits.csv'
            # Retry the URL lookup, sleeping 2 seconds between tries; the
            # 11th consecutive failure is re-raised.
            failures = 0
            while True:
                try:
                    profile_url = client.file_url(traits_file)
                except Exception as err:
                    failures += 1
                    if failures > 10:
                        raise err
                    time.sleep(2)
                else:
                    break
            self.profile_frame = pd.read_csv(profile_url)
            client.delete(traits_file)
            # Replace the profiler-provided column_id with a 1-based row number.
            del self.profile_frame['column_id']
            self.profile_frame['column_id'] = self.profile_frame.index + 1
            self.profile_frame.to_csv('/storage/traits.csv', index=False)
            client.upload('/storage/traits.csv', profiler_dir)
            # Only the in-memory frame uses the name "id"; the uploaded CSV
            # keeps "column_id".
            self.profile_frame.rename(columns={'column_id': 'id'}, inplace=True)
            requests.post(url=profiler_base_url + 'delete', data={'uid': profiler_id}, headers=auth)
            return True
Esempio n. 4
0
 def apply_model(self, path):
     """Predict matches for the blocked pairs and upload them as ``matches.csv``.

     All rows of ``self.features_frame`` except the last five are scored with
     ``self.model``. Rows of ``self.block_frame`` predicted as ``1`` are
     kept, renumbered with a 1-based ``id`` column, written under
     ``settings.DATA_PATH/<uid>/`` and uploaded to ``path``.

     Args:
         path: Destination passed to ``client.upload``.
     """
     # NOTE(review): the trailing 5 rows are skipped; presumably these are the
     # seed examples appended to the featurizer input -- confirm the count.
     features = self.features_frame[:-5].drop(columns=['id'])

     matches = self.block_frame.copy()
     matches['label'] = self.model.predict(features)
     matches = matches[matches['label'] == 1].drop(columns=['label', 'id'])
     matches.insert(0, 'id', range(1, 1 + len(matches)))

     file_path = settings.DATA_PATH + '/' + self.uid + '/' + 'matches.csv'
     matches.to_csv(file_path, index=False)
     py_cdrive_api.Client(access_token=self.access_token).upload(file_path, path)
Esempio n. 5
0
 def complete_iteration(self):
     """Fold the labels of the current iteration into the training set and continue.

     Updates the ``SMJob`` row for ``self.uid``, reads
     ``iteration-<n>-<uid>-labeled.csv`` from the ``learner`` folder, maps
     its ``Yes``/``No`` labels to ``1``/``0``, appends the rows to
     ``self.train`` (cast to ``int``) and calls ``run_iteration``.

     Returns:
         The job URL built from the ``CDRIVE_URL`` and ``COLUMBUS_USERNAME``
         environment variables and ``self.uid``.
     """
     sm_job = SMJob.objects.filter(uid=self.uid)[0]
     sm_job.stage = 'Active Learning'
     sm_job.status = 'Running'
     sm_job.long_status = 'Iteration {}/{}'.format(self.current_iteration, self.iterations)
     sm_job.save()

     labeled_path = '{}/learner/iteration-{}-{}-labeled.csv'.format(
         self.output_dir, self.current_iteration, self.uid)
     client = py_cdrive_api.Client(access_token=self.access_token)
     new_examples = pd.read_csv(client.file_url(labeled_path))
     label_codes = {'Yes': 1, 'No': 0}
     new_examples['label'] = new_examples['label'].map(label_codes)
     self.train = pd.concat([self.train, new_examples]).astype(int)

     self.run_iteration()
     return ''.join((
         os.environ['CDRIVE_URL'], 'app/', os.environ['COLUMBUS_USERNAME'],
         '/lynx/job/', self.uid))
Esempio n. 6
0
 def calculate_accuracy(self):
     """Score ``apply-model/matches.csv`` against ``self.gold``.

     A prediction counts as a hit when its values in the gold columns form a
     gold row, either as-is or with the ``l_``/``r_`` prefixes of the column
     names swapped.

     Returns:
         dict with ``precision``, ``recall`` and ``f1Score``. A metric whose
         denominator would be zero is reported as ``0.0``.
     """
     client = py_cdrive_api.Client(access_token=self.access_token)
     predictions = pd.read_csv(client.file_url(self.output_dir + '/apply-model/matches.csv'))

     gold_columns = list(self.gold.columns)
     gold_index = self.gold.set_index(gold_columns).index
     forward_hits = predictions.set_index(gold_columns).index.isin(gold_index)
     # Same lookup with left/right swapped, so (a, b) also matches a gold (b, a).
     swapped_columns = [
         'r_' + name[2:] if name.startswith('l_') else 'l_' + name[2:]
         for name in gold_columns
     ]
     reverse_hits = predictions.set_index(swapped_columns).index.isin(gold_index)
     predictions['ground_truth'] = forward_hits | reverse_hits

     # Previously ``relevant_docs`` was never assigned, so the method raised
     # NameError; it is the number of predictions confirmed by the gold set.
     relevant_docs = int(predictions['ground_truth'].sum())

     predicted_total = len(predictions)
     # NOTE(review): the factor 2 is kept from the original formula;
     # presumably each gold pair is expected in both orientations -- confirm.
     gold_total = 2 * len(self.gold)
     precision = relevant_docs / predicted_total if predicted_total else 0.0
     recall = relevant_docs / gold_total if gold_total else 0.0
     # Harmonic mean, written so that a zero precision/recall cannot divide by zero.
     if precision + recall:
         f1_score = 2 * precision * recall / (precision + recall)
     else:
         f1_score = 0.0
     return {'precision': precision, 'recall': recall, 'f1Score': f1_score}
Esempio n. 7
0
 def init_learner(self):
     """Prepare the ``learner`` folder and create the first labeling task.

     Recreates the ``learner`` folder under ``self.output_dir``, marks the
     ``SMJob`` row as Active Learning / Running / Initializing, uploads a
     four-column extract of ``self.profile_frame`` together with
     ``/options.json``, loads ``self.gold`` when ``self.gold_path`` is set,
     and creates a labeling task from the last three seed examples.
     """
     client = py_cdrive_api.Client(access_token=self.access_token)
     learner_dir = self.output_dir + '/learner'
     try:
         client.delete(learner_dir)
     except py_cdrive_api.ForbiddenAccessException:
         # Ignored on purpose; presumably there was nothing to delete
         # -- TODO confirm against py_cdrive_api.
         pass
     client.create_folder(self.output_dir, 'learner')

     sm_job = SMJob.objects.filter(uid=self.uid)[0]
     sm_job.stage = "Active Learning"
     sm_job.status = "Running"
     sm_job.long_status = "Initializing"
     sm_job.save()

     profiles_csv = settings.DATA_PATH + '/' + self.uid + '/truncated-profiles.csv'
     self.profile_frame[['id', 'dataset', 'column', 'sample']].to_csv(profiles_csv, index=False)
     for local_file in (profiles_csv, '/options.json'):
         client.upload(local_file, learner_dir)

     if self.gold_path is not None:
         gold_url = client.file_url(self.gold_path)
         self.gold = pd.read_csv(gold_url)
     self.create_labeling_task(self.seed_examples.tail(3))
Esempio n. 8
0
    def post(self, request):
        """Store the posted config as a file in the user's lynx app folder.

        Reads the bearer token from the ``Authorization`` header and the
        ``config`` / ``configName`` fields from the request body, deletes any
        file of that name under ``users/<COLUMBUS_USERNAME>/apps/lynx`` and
        creates it again with the posted content.

        Returns:
            401 if the drive client reports unauthorized access, otherwise 200.
        """
        token = request.META['HTTP_AUTHORIZATION'].split()[1]
        payload = request.data
        config_string = payload['config']
        config_name = payload['configName']

        client = None
        try:
            client = py_cdrive_api.Client(access_token=token)
            client.delete('/'.join(
                ('users', os.environ['COLUMBUS_USERNAME'], 'apps', 'lynx', config_name)))
        except py_cdrive_api.UnauthorizedAccessException:
            return Response(status=status.HTTP_401_UNAUTHORIZED)
        except py_cdrive_api.ForbiddenAccessException:
            # Ignored on purpose; presumably there was no earlier file to
            # delete -- TODO confirm against py_cdrive_api.
            pass

        client.create_file(
            cdrive_path='/'.join(('users', os.environ['COLUMBUS_USERNAME'], 'apps', 'lynx')),
            content=config_string,
            file_name=config_name)
        return Response(status=status.HTTP_200_OK)
Esempio n. 9
0
 def run_iteration(self):
     """Train on the current labels, then request more labels or finish the job.

     Increments ``self.current_iteration``, records it on the ``SMJob`` row
     and fits a ``RandomForestClassifier`` on the feature rows whose ``id``
     appears in ``self.train``.

     * While ``current_iteration <= iterations`` and more than
       ``min_test_size`` unlabeled rows remain, the ``batch_size`` unlabeled
       rows with the highest prediction entropy are sent to
       ``create_labeling_task``.
     * Otherwise the model is saved to the ``learner`` folder, applied into a
       recreated ``apply-model`` folder, and the job is marked Complete.
     """
     self.current_iteration = self.current_iteration + 1
     sm_job = SMJob.objects.filter(uid=self.uid)[0]
     sm_job.stage = "Active Learning"
     sm_job.status = "Running"
     sm_job.long_status = 'Iteration ' + str(self.current_iteration) + '/' + str(self.iterations)
     sm_job.iteration = self.current_iteration
     sm_job.save()

     # Labels are ordered by id so they pair up positionally with the feature
     # rows selected below (assumes features_frame is id-sorted with one row
     # per id -- TODO confirm).
     self.train = self.train.sort_values('id').reset_index(drop=True)
     self.model = RandomForestClassifier(n_estimators=self.n_estimators)
     is_labeled = self.features_frame['id'].isin(self.train['id'])
     X_train = self.features_frame[is_labeled].drop(columns=['id'])
     y_train = self.train['label'].values.ravel()
     self.model.fit(X_train, y_train)

     X_test = self.features_frame[~is_labeled]
     if (self.current_iteration <= self.iterations) and (len(X_test) > self.min_test_size):
         entropies = pd.DataFrame()
         entropies['id'] = X_test['id']
         probabilities = self.model.predict_proba(X_test.drop(columns=['id']))
         entropies['prob_0'] = probabilities[:, 0]
         entropies['prob_1'] = probabilities[:, 1]
         entropies['entropy'] = entropies.apply(lambda en: calculate_entropy(en.get("prob_0").item(), en.get("prob_1").item()), axis=1)
         # Ask for labels on the pairs the model is least certain about.
         most_uncertain_ids = entropies.sort_values("entropy", ascending=False).head(self.batch_size)["id"]
         new_examples = pd.DataFrame()
         new_examples[['id', 'l_id', 'r_id']] = self.block_frame[self.block_frame["id"].isin(most_uncertain_ids)][['id', 'l_id', 'r_id']]
         self.create_labeling_task(new_examples)
     else:
         client = py_cdrive_api.Client(access_token=self.access_token)
         try:
             client.delete(self.output_dir + '/apply-model')
         except py_cdrive_api.ForbiddenAccessException:
             # Deliberately ignored; presumably raised when there is no
             # previous folder to delete -- TODO confirm.
             pass
         client.create_folder(self.output_dir, 'apply-model')
         self.save_model(self.output_dir + '/learner')
         self.apply_model(self.output_dir + '/apply-model')
         # The original assigned "Apply Model" to ``status`` and overwrote it
         # on the next line; it names a stage, so it is stored on ``stage``.
         sm_job.stage = "Apply Model"
         sm_job.status = "Complete"
         sm_job.long_status = "Training complete. Model applied to blocking output. Matches saved to " + self.output_dir + '/apply-model/matches.csv'
         sm_job.save()
Esempio n. 10
0
 def fake_label(self, task_name):
     """Label a task automatically from ``self.gold`` and complete the iteration.

     Reads ``<task_name>.csv`` from the ``learner`` folder. In iteration 0
     every example is labeled ``No``. In later iterations the matching rows
     of ``self.block_frame`` are labeled ``Yes`` when their gold-column
     values form a gold row, either as-is or with the ``l_``/``r_`` column
     prefixes swapped, and ``No`` otherwise. The result is written to
     ``<task_name>-labeled.csv``, uploaded to the ``learner`` folder, and
     ``complete_iteration`` is called.

     Args:
         task_name: Base name (without extension) of the task file.
     """
     client = py_cdrive_api.Client(access_token=self.access_token)
     examples = pd.read_csv(client.file_url(self.output_dir + '/learner/' + task_name + '.csv'))

     if self.current_iteration == 0:
         examples['label'] = 'No'
     else:
         gold_columns = list(self.gold.columns)
         gold_index = self.gold.set_index(gold_columns).index
         swapped_columns = [
             'r_' + name[2:] if name.startswith('l_') else 'l_' + name[2:]
             for name in gold_columns
         ]

         pairs = self.block_frame[self.block_frame['id'].isin(examples['id'])]
         pairs = pairs.set_index(gold_columns)
         forward_hits = pairs.index.isin(gold_index)
         pairs = pairs.reset_index().set_index(swapped_columns)
         reverse_hits = pairs.index.isin(gold_index)

         pairs['label'] = forward_hits | reverse_hits
         pairs['label'] = pairs['label'].map({True: 'Yes', False: 'No'})
         # Dropping the index discards the gold columns from the output.
         examples = pairs.reset_index(drop=True)

     labeled_name = task_name + '-labeled.csv'
     labeled_path = settings.DATA_PATH + '/' + self.uid + '/' + labeled_name
     examples.to_csv(labeled_path, index=False)
     client.upload(labeled_path, self.output_dir + '/learner')
     self.complete_iteration()
Esempio n. 11
0
    def create_labeling_task(self, examples):
        """Upload ``examples`` as the labeling task for the current iteration.

        The examples are cast to ``int``, written to
        ``iteration-<n>-<uid>.csv`` and uploaded to the ``learner`` folder.
        When ``self.gold_path`` is set the task is labeled through
        ``fake_label``; otherwise a task is created on the labeler service
        and the ``SMJob`` row is set to Ready with the labeling URL and a
        progress message.

        Args:
            examples: Frame of examples to label; must be castable to ``int``.
        """
        examples = examples.astype(int)
        task_name = 'iteration-' + str(self.current_iteration) + '-' + self.uid
        file_name = task_name + '.csv'
        local_csv = settings.DATA_PATH + '/' + self.uid + '/' + file_name
        examples.to_csv(local_csv, index=False)

        learner_dir = self.output_dir + '/learner'
        client = py_cdrive_api.Client(access_token=self.access_token)
        client.upload(local_csv, learner_dir)

        if self.gold_path is not None:
            # Gold labels are available, so no human labeling is requested.
            self.fake_label(task_name)
            return

        username = os.environ['COLUMBUS_USERNAME']
        task_spec = {
            'retId': self.uid,
            'taskName': task_name,
            'template': 'EMD',
            'dataPath': learner_dir + '/truncated-profiles.csv',
            'examplesPath': learner_dir + '/' + file_name,
            'labelOptionsPath': learner_dir + '/options.json',
            'completionUrl': 'http://lynx-' + username + '/api/complete-iteration/',
            'outputPath': learner_dir,
            'outputName': task_name + '-labeled.csv'
        }
        requests.post(
            'http://labeler-' + username + '/api/create-task',
            data=json.dumps(task_spec),
            headers={'Authorization': self.auth_header, 'content-type': 'application/json'})

        sm_job = SMJob.objects.filter(uid=self.uid)[0]
        sm_job.stage = 'Active Learning'
        sm_job.status = 'Ready'
        sm_job.labeling_url = os.environ['CDRIVE_URL'] + 'app/' + username + '/labeler/example/' + task_name

        current = self.current_iteration
        if current == 0:
            message = "Labeling seed examples"
        elif current == 1:
            message = 'Finished labeling seed examples. Labeling examples for iteration 1/' + str(self.iterations) + '.'
        else:
            total = str(self.iterations)
            message = ('Finished labeling examples for iteration ' + str(current - 1) + '/' + total
                       + '. Labeling examples for iteration ' + str(current) + '/' + total + '.')
        sm_job.long_status = message
        sm_job.save()
Esempio n. 12
0
    def get(self, request):
        """Return the user's ``default_config.json`` from the lynx app folder.

        Returns:
            401 when the drive client reports unauthorized access; 403 when
            access is forbidden or the permission on the lynx folder is not
            ``Edit``; 200 with an empty object when requesting the config
            file URL is forbidden; otherwise 200 with the parsed config.
        """
        token = request.META['HTTP_AUTHORIZATION'].split()[1]
        try:
            client = py_cdrive_api.Client(access_token=token)
            folder_details = client.list_detailed(
                'users/' + os.environ['COLUMBUS_USERNAME'] + '/apps/lynx')
            if folder_details['permission'] != 'Edit':
                return Response(status=status.HTTP_403_FORBIDDEN)
        except py_cdrive_api.UnauthorizedAccessException:
            return Response(status=status.HTTP_401_UNAUTHORIZED)
        except py_cdrive_api.ForbiddenAccessException:
            return Response(status=status.HTTP_403_FORBIDDEN)

        config_path = ('users/' + os.environ['COLUMBUS_USERNAME']
                       + '/apps/lynx/default_config.json')
        try:
            config_url = client.file_url(config_path)
        except py_cdrive_api.ForbiddenAccessException:
            # Answered with an empty config rather than an error.
            return Response({}, status=status.HTTP_200_OK)

        downloaded = requests.get(config_url)
        config = json.loads(downloaded.text)
        return Response(config, status=status.HTTP_200_OK)
Esempio n. 13
0
 def save_model(self, path):
     """Dump ``self.model`` with joblib and upload the file to ``path``.

     The file is named ``iteration-<current_iteration>-model.joblib`` and is
     written under ``settings.DATA_PATH/<uid>/`` before the upload.

     Args:
         path: Destination passed to ``client.upload``.
     """
     model_file = 'iteration-{}-model.joblib'.format(self.current_iteration)
     local_path = '/'.join((settings.DATA_PATH, self.uid, model_file))
     joblib.dump(self.model, local_path)
     py_cdrive_api.Client(access_token=self.access_token).upload(local_path, path)
Esempio n. 14
0
import requests, json, py_cdrive_api, time, argparse

# First client construction: a failure here is only printed, not fatal.
# NOTE(review): the client is built again below without the guard -- confirm
# whether this early attempt is still needed.
try:
    client = py_cdrive_api.Client(domain='col4infa.io')
except Exception as e:
    print(e)

# Command-line interface: every option is registered as optional and then
# validated by hand below.
parser = argparse.ArgumentParser(description='Lynx Simulator')
for _short_flag, _long_flag, _help_text in (
    ('-i', '--input-dir', 'Path to profiler input'),
    ('-o', '--output-dir', 'Path to lynx output'),
    ('-g', '--gold', 'Path to gold labels'),
    ('-c', '--config', 'Path to config'),
):
    parser.add_argument(_short_flag, _long_flag, help=_help_text)

options = parser.parse_args()

# Input dir, output dir and gold labels are mandatory; --config is not.
for _value, _missing_message in (
    (options.input_dir, "Please pass input directory with -i flag"),
    (options.output_dir, "Please pass output directory with -o flag"),
    (options.gold, "Please pass gold labels with -g flag"),
):
    if _value is None:
        raise Exception(_missing_message)

input_dir, output_dir, gold = options.input_dir, options.output_dir, options.gold

# Initialize py_cdrive_api client
client = py_cdrive_api.Client(domain='col4infa.io')

# Use the config given with -c, falling back to the user's default_config.json
if options.config is None:
    config_path = 'users/' + client.username + '/apps/lynx/default_config.json'
else:
    config_path = options.config
config_url = client.file_url(config_path)