def copy(self, ids):
    # Make sure the reports directory exists before fetching anything.
    file_utils.make_directory(self.reports_directory)
    ids_array = ids.keys()
    # ids_array = [ids_array[0]]
    # Pull the documents from the source index in batches of 500.
    self.data_utils.batch_fetch_docs_for_ids(base_url=self.src_server,
                                             ids=ids_array,
                                             index=self.src_index,
                                             type=self.src_type,
                                             docs_fetched=self.docs_fetched,
                                             batch_size=500)
def get_doc_ids(server, src_index, src_type, dest_dir, dest_file_name, query=None):
    # Reuse a previously saved id list if one exists on disk.
    documents_ids = file_utils.load_file(dest_dir, dest_file_name)
    if len(documents_ids) == 0:
        # Otherwise export the ids from the source index and cache them for next time.
        documents_ids = export_doc_ids(server, src_index, src_type, query)
        print __name__, 'Saving to', dest_dir, dest_file_name
        file_utils.make_directory(dest_dir)
        file_utils.save_file(dest_dir, dest_file_name, documents_ids)
    return documents_ids
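# A minimal usage sketch for get_doc_ids; the server URL, index name, type and
# cache location below are illustrative assumptions, not values taken from this code.
doc_ids = get_doc_ids(server='http://localhost:9200',
                      src_index='articles',
                      src_type='article',
                      dest_dir='processing/ids',
                      dest_file_name='article_ids.json')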
def dump_to_file(self):
    # Build a per-data-source log directory and make sure it exists.
    logs_directory = self.log_files_directory + '/' + self.data_source_name
    file_utils.make_directory(logs_directory)
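# file_utils.make_directory itself is not shown in these snippets; a plausible
# minimal implementation (an assumption, not the project's actual helper) would
# simply create the directory if it does not already exist:
import os


def make_directory(directory):
    # Create the directory, doing nothing if it is already there.
    if not os.path.exists(directory):
        os.makedirs(directory)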
region = system_variables[8]
if sys.platform == 'linux2':
    # set constant variables ec2
    output_dir = '/home/ec2-user/output/'
    input_dir = '/home/ec2-user/input/'
    processed_dir = '/home/ec2-user/processed/'
else:
    # set constant variables local
    output_dir = 'processing/output/'
    input_dir = 'processing/input/'
    processed_dir = 'processing/processed/'

# make the directories
file_utils.make_directory(output_dir)
file_utils.make_directory(input_dir)
file_utils.make_directory(processed_dir)

# create the s3 utility for the peon to use
s3_util = s3_utility.S3Utility(access_key, secret_key)

# load system parameters set by master node into local variables to use in init of peon
peon = Peon(
    instance_id,
    s3_util,
    input_s3_bucket,
    output_s3_bucket,
    processed_s3_bucket,
    data_group_id,
    processing_strategy,
filename = "%s/word-assignments.dat" % directory w_assign_file = open(filename, "w") for doc_index in range(0, corpus.num_docs): # Show user script is still processing if (doc_index % 100) == 0: print("final e step document %d" % doc_index) likelihood += util_functions.lda_inference(corpus.doc_list[doc_index], model, var_gamma[doc_index], phi) file_utils.write_word_assignment(w_assign_file, corpus.doc_list[doc_index], phi, model) w_assign_file.close() likelihood_file.close() if __name__ == "__main__": # define global variables if len(sys.argv) == 7: global_att.INITIAL_ALPHA = float(sys.argv[1]) global_att.NTOPICS = int(sys.argv[2]) file_utils.read_settings(sys.argv[3]) corpus = corpus.Corpus(sys.argv[4]) file_utils.make_directory(sys.argv[6]) run_em(sys.argv[5], sys.argv[6], corpus) else: print("usage : lda_estimate [initial alpha] [k] [settings] " "[data] [random/seeded/*] [directory]")