def _on_run_sampler(self, e):
    '''Handles the run sampler button click event '''
    try:
        # Convert the percentage values typed into the UI into fractions.
        self.confidence_val = Decimal(self.confidence.GetValue()) / Decimal('100')
        self.precision_val = float(self.precision.GetValue()) / 100.0

        # Guard clause: both directories must exist before we sample anything.
        if not (os.path.exists(self.dir_path) and os.path.exists(self.output_dir_path)):
            wx.MessageDialog(self, "Please enter a valid input/output directory",
                             "Error", wx.ICON_ERROR).ShowModal()
            return

        # Enumerate, sample, and copy -- updating the status bar at each step.
        all_files = find_files_in_folder(self.dir_path)
        self.SetStatusText('%d files found in %s.' % (len(all_files), self.dir_path))
        sample = random_sampler(all_files, self.confidence_val, self.precision_val, self.SEED)
        self.SetStatusText('%d files are sampled out of %d files.' % (len(sample), len(all_files)))
        copy_files_with_dir_tree(self.dir_path, sample, self.output_dir_path)
        self.SetStatusText('%d randomly sampled files (from %d files) are copied to the output folder.' % (len(sample), len(all_files)))

        # shows the tree list control
        self.process_files_tree.on_changed_output_dir(self.output_folder_control.GetValue())
        self.process_files_tree.Show(True)
        self.GetSizer().Layout()
        self.Refresh()
    except Exception as err:
        # Surface any failure to the user in a dialog instead of crashing the GUI.
        wx.MessageDialog(self, str(err), "Error", wx.ICON_ERROR).ShowModal()
def main():
    """Command-line entry point: randomly sample files from a folder tree.

    Parses -d/-c/-p/-o arguments, validates them, logs the run to a
    timestamped file inside the output directory, draws a random sample
    from the input folder's file list, and copies the sampled files to a
    timestamped destination directory.

    Returns:
        list: the randomly sampled file paths.

    Raises:
        Exception: if the input folder does not exist, or if confidence /
            precision are not probabilities in (0, 1].
    """
    timestamp = datetime.datetime.now()
    arg_parser = argparse.ArgumentParser('Random sample test function:')
    arg_parser.add_argument("-d", dest="input_folder", type=str,
                            help="the root directory for all the mails", required=True)
    # NOTE: argparse %-formats help text, so literal percent signs must be
    # escaped as '%%' -- a bare '%' makes `-h` raise ValueError.
    arg_parser.add_argument("-c", dest="confidence", type=float,
                            help="The confidence interval eg. 0.95 for 95%%", required=True)
    arg_parser.add_argument("-p", dest="precision", type=float,
                            help="The precision for the interval eg. 0.02 precision for "
                                 ".95 confidence gives 95%% +/-2%% error", required=True)
    arg_parser.add_argument("-o", dest="output_dir", type=str,
                            help="Output directory of samples",
                            default="/home/abhiramj/code/temp/samples", required=False)
    args = arg_parser.parse_args()

    if not os.path.isdir(args.output_dir):
        logger.debug("Making output directory " + args.output_dir)
        os.makedirs(args.output_dir)

    # Per-run log file named by timestamp so successive runs don't clobber each other.
    file_handle = logging.FileHandler(os.path.join(
        args.output_dir, 'random_sampler_test_function--' + str(timestamp) + '.log'))
    file_handle.setLevel(logging.INFO)
    file_handle.setFormatter(formatter)
    logger.addHandler(file_handle)
    logger.info("Args are: ")
    logger.info("input_folder: " + args.input_folder)
    logger.info("confidence: " + str(args.confidence))
    logger.info("precision: " + str(args.precision))
    logger.info("output_dir: " + args.output_dir)

    # Validate inputs; use call-style raise (works on both Python 2 and 3,
    # unlike the old `raise Exception, msg` form).
    if not os.path.exists(args.input_folder):
        logger.error("Exiting with error: Input folder cannot be found")
        raise Exception("Input folder cannot be found")
    if args.confidence <= 0 or args.confidence > 1:
        logger.error("Exiting with error: Confidence is not valid, enter as a probability between 0 and 1")
        raise Exception("Confidence is not valid, enter as a probability between 0 and 1")
    if args.precision <= 0 or args.precision > 1:
        logger.error("Exiting with error: Precision is not valid, enter as a probability between 0 and 1")
        raise Exception("Precision is not valid, enter as a probability between 0 and 1")

    file_list = find_files_in_folder(args.input_folder)
    message_random_sample = random_sampler(file_list, args.confidence, args.precision,
                                           SEEDCONSTANT=0.5)
    # Destination is suffixed with the run timestamp to keep samples separate.
    file_destination_dir = args.output_dir + "--" + str(timestamp)
    copy_random_files(file_destination_dir, message_random_sample)
    return message_random_sample
def __initialize(self):
    """Populate the cache of per-player decided-set values for both tours."""
    for sex in ("atp", "wta"):
        stats_dir = os.path.join(cfg_dir.stat_players_dir(sex), "decided_set")
        data_files = fu.find_files_in_folder(stats_dir, filemask="*", recursive=False)
        for data_file in data_files:
            # e.g. 'decided.txt' -> subname 'decided'
            subname = os.path.basename(data_file).replace(".txt", "")
            loaded = dict_tools.load(
                data_file,
                createfun=lambda: defaultdict(rl.SizedValue),
                keyfun=int,
                valuefun=rl.SizedValue.create_from_text,
            )
            for player_id, sized_val in loaded.items():
                self.val_from_sexnameplr[(sex, subname, player_id)] = sized_val
def lucene_index(input_folder, output_folder):
    '''
    Indexes fresh text data using lucene 3.6. Doesn't support incremental
    generation of index as of now. Currently crashes on neo by running out
    of heap space.

    Arguments:
        input_folder: directory containing the text files to index.
        output_folder: directory where the index (and its log file) is stored.

    Returns:
        void. The index is stored if generated.
    '''
    # BUGFIX: create the output directory BEFORE configuring the log file,
    # since the log file lives inside it.
    if not os.path.isdir(output_folder):
        logger.debug("Making output directory for index: " + output_folder)
        os.makedirs(output_folder)

    # Setting up log file.
    # BUGFIX: basicConfig takes 'filename', not 'file' -- the unrecognized
    # kwarg was silently ignored and no log file was ever created.
    logging.basicConfig(filename=os.path.join(output_folder, "lucene_index.log"))
    logging.info("Input directory for logging: " + input_folder)
    logging.info("Output directory of index: " + output_folder)

    # Setting up lucene's heap size for index and version of indexer
    lucene.initVM(initialheap='1024m', maxheap='2048m')
    index_folder = SimpleFSDirectory(File(output_folder))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    writer = IndexWriter(index_folder, analyzer, True, IndexWriter.MaxFieldLength.UNLIMITED)
    # Optimization to reduce heap space usage for generation of index.
    # Merges buffer with current index after 15 docs.
    writer.setMergeFactor(15)
    writer.setRAMBufferSizeMB(32.0)

    # Search to find the files to index
    files_to_index = find_files_in_folder(input_folder)
    for input_file in files_to_index:
        doc = Document()
        # BUGFIX: close the file handle deterministically instead of leaking it.
        with open(input_file, 'r') as fh:
            content = fh.read()
        doc.add(Field("text", content, Field.Store.NO, Field.Index.ANALYZED))  # Do not store text. Only index.
        doc.add(Field("path", input_file, Field.Store.YES, Field.Index.NO))  # Store path to assist in retrieving the file
        writer.addDocument(doc)  # Index

    logger.info("Indexed lines from " + input_folder + " (%d documents in index)" % (writer.numDocs()))
    logger.info("About to optimize index of %d documents..." % writer.numDocs())
    writer.optimize()  # Compress index
    logger.info("...done optimizing index of %d documents" % writer.numDocs())
    logger.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()
    logger.info("Closed index")
def _last_file_date(sex: str, surface="all"):
    """Return the date of the newest '<sex>_<surface>_elo_*.json' rating file.

    The date is parsed from the trailing 'yyyy.mm.dd' portion of the
    filename (just before '.json'). Returns None when no matching file
    is long enough to carry a date stamp.
    """
    mask = "{}_{}_elo_*.json".format(sex, str(surface))
    found_dates = []
    for fname in fu.find_files_in_folder(cfg_dir.ratings_dir(),
                                         filemask=mask, recursive=False):
        # Too-short names cannot contain the 'yyyy.mm.dd.json' suffix.
        if len(fname) < 15:
            continue
        stop = -len(".json")
        start = stop - len("yyyy.mm.dd")
        stamp = fname[start:stop]
        found_dates.append(datetime.date(int(stamp[0:4]),
                                         int(stamp[5:7]),
                                         int(stamp[8:10])))
    return max(found_dates) if found_dates else None
def do_load(self, dialog):
    """Rescan the current input directory, repaint the window, and close *dialog*.

    Args:
        dialog: the wx dialog that triggered the load; closed when done.
    """
    # Refresh the cached file list from the (possibly changed) directory.
    self.file_list = find_files_in_folder(self.dir_path)
    self.Refresh()
    dialog.Close()
def __init__(self, parent):
    '''
    Constructor.

    Seeds default confidence/precision values, builds the tag-list UI,
    defaults the input/output paths to the temp directory and scans it
    for files, then sets up icons and shows the window.
    '''
    # Some value needs to initialized for this to run without exception
    self.confidence_val = DEFAULT_CONFIDENCE_LEVEL / Decimal('100')
    self.precision_val = DEFAULT_CONFIDENCE_INTERVAL
    # Calls the parent class's method
    super(RandomSampler, self).__init__(parent)
    # stack to store files and tags
    self.file_tag_dict = {}
    '''
    This is the tag format
    self.file_tag_dict(filename) = [('Reviewed','True'),('Accept', 'False'),('A1','False'),('A2','True')]
    '''
    # initialize the default list of tags and the current tag
    self.DEFAULT_TAGS_NUMBER = 2
    self.REVIEWED_TAG_INDEX = 0
    self.ACCEPT_TAG_INDEX = 1
    self.current_file_selected = None
    self.default_tag = ('Default', 'True')
    self.current_tag_list = self.make_default_tag_list()
    # Build the two-column tag list with the two built-in tags unset.
    self._tag_list.ClearAll()
    self._tag_list.InsertColumn(0, 'Tag')
    self._tag_list.InsertColumn(1, 'Status')
    self._tag_list.InsertStringItem(self.REVIEWED_TAG_INDEX, 'Reviewed')
    self._tag_list.SetStringItem(self.REVIEWED_TAG_INDEX, 1, 'False')
    self._tag_list.InsertStringItem(self.ACCEPT_TAG_INDEX, 'Accept')
    self._tag_list.SetStringItem(self.ACCEPT_TAG_INDEX, 1, 'False')
    # Separator for splitting tags
    self.TAG_NAME_SEPARATOR = " , "
    self.TAG_PREFIX = 'tag :'
    # Maximum depth of folders expanded for display
    self.MAX_FOLDER_DEPTH = 2
    self._st_num_samples.Hide()
    self.dir_path = tempfile.gettempdir()  # a cross-platform way of getting the path to the temp directory
    self.output_dir_path = tempfile.gettempdir()
    self.from_copy_files_dir = self.dir_path
    self.to_copy_files_dir = self.output_dir_path
    # for the I/O tab
    self._tc_data_dir.SetValue(self.dir_path)
    self._tc_output_dir.SetValue(self.output_dir_path)
    self._tc_out_data_dir.SetValue(self.dir_path)
    self._tc_out_output_dir.SetValue(self.output_dir_path)
    # Initial scan of the (temp) input directory so counts show immediately.
    self.file_list = find_files_in_folder(self.dir_path)
    self._st_num_data_dir_files.SetLabel('%d files found' % len(self.file_list))
    self._st_out_num_data_dir_files.SetLabel('%d files found' % len(self.file_list))
    # Defaults for random sample calculation
    self.SEED = 2013
    self._set_confidence_level_and_interval()
    self.confidence_val = Decimal(self._cbx_confidence_levels.GetValue()) / Decimal('100')
    self.get_precision_as_float()
    self.Bind(wx.EVT_COMMAND_FIND_REPLACE_ALL, self._on_load_tag_list)
    self._panel_samples.Show(False)  # make the tree list control invisible
    # Icon defaults
    self.icon_size = (16, 16)
    self.image_list = wx.ImageList(self.icon_size[0], self.icon_size[1])
    self._tc_results.SetImageList(self.image_list)
    self.folder_icon = self.image_list.Add(wx.ArtProvider_GetBitmap(wx.ART_FOLDER, wx.ART_OTHER, self.icon_size))
    self.folder_open_icon = self.image_list.Add(wx.ArtProvider_GetBitmap(wx.ART_FILE_OPEN, wx.ART_OTHER, self.icon_size))
    self.file_icon = self.image_list.Add(wx.ArtProvider_GetBitmap(wx.ART_NORMAL_FILE, wx.ART_OTHER, self.icon_size))
    # Start on the first notebook page and show the frame centered.
    self._current_page = 0
    self.nb_config_sampler.ChangeSelection(self._current_page)
    self.Center()
    self.Show(True)
if search_algorithm == 'LDA': None # Process the query # responsive_docs, non_responsive_docs = process_query(query, dictionary, lda, index, doc_paths, limit) # nrd = np.array(non_responsive_docs) # nrd_paths = [os.path.join(dir_path, nrd[idx,2]) for idx, dir_path in enumerate(nrd[:,1])] # looks like i'm not getting full file paths elif search_algorithm == 'Lucene': # None responsive_docs = lucene_search(lucene_index_file, limit, query) non_responsive_docs = [] for file_name in find_files_in_folder(DATA_PATH): if os.path.dirname(file_name) is not lucene_index_file: # skipping index directory if file_name not in responsive_docs: non_responsive_docs.append(file_name) nrd_paths=non_responsive_docs print 'Number of responsive documents:', len(responsive_docs) print 'Number of non responsive documents:', len(non_responsive_docs) print 'The responsive files are: ' for f in responsive_docs: print f # ## Enter confidence intervals to get samples #