def main(argv=None):
    '''
    Handles command line arguments and gets things started.

    Reads every MegaM file named on the command line, merges the non-zero
    features of instances that share a filename/ID, and prints the combined
    MegaM data to standard output.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str

    :raises ValueError: If an instance is given different class labels in
                        different files, or if a feature value is neither
                        ``'N/A'`` nor convertible to a float.
    '''
    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Combine MegaM files that \
                     contain features for the same\
                     files.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('megam_file',
                        help='MegaM input file(s). Each feature line must be \
                              preceded by a comment with the filename/ID that \
                              the features should be joined on.',
                        nargs='+')
    parser.add_argument('-b', '--binary',
                        help='Converts all of the features in the specified \
                              range of files to presence/absence binary \
                              features. Files are numbered starting from 1, and\
                              if 0 is specified with this flag, all files are\
                              converted.',
                        type=parse_num_list)
    parser.add_argument('--doubleup',
                        help='Keep both the binary and numeric versions of any\
                              feature you convert to binary.',
                        action='store_true')
    parser.add_argument('-c', '--common',
                        help='Only output features for filenames that are \
                              common to all MegaM files.',
                        action='store_true')
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # Map from filenames to the list of "name value" pairs collected so far.
    # Kept under its own name so that the per-instance feature dictionaries
    # yielded by the reader below cannot overwrite it.
    features_by_file = OrderedDict()
    class_dict = {}
    filename_set = set()

    # Set that will contain all of the features seen in previous files
    # (for duplicate detection)
    prev_feature_set = set()

    # Iterate through MegaM files
    for file_num, infile in enumerate(args.megam_file, start=1):
        # argparse passes the inputs as plain path strings, which have no
        # ``name`` attribute; fall back to the path itself for messages.
        source_name = getattr(infile, 'name', infile)

        # Should the features of this file be converted to binary?
        binarize = bool(args.binary) and (args.binary == [0] or
                                          file_num in args.binary)

        # Initialize duplicate feature book-keeping variables
        curr_feature_set = set()

        # Filenames mentioned in the current MegaM file
        curr_filename_set = set()

        # Handle current MegaM file
        for curr_filename, class_name, inst_features in _MegaMDictIter(infile):
            curr_filename_set.add(curr_filename)

            # Every file must agree on the class label of an instance
            if class_dict.setdefault(curr_filename, class_name) != class_name:
                raise ValueError(("Inconsistent class label for instance " +
                                  "{} in {}.").format(curr_filename,
                                                      source_name))

            # Make sure every instance gets an entry, even when all of its
            # features turn out to be zero and nothing is appended below.
            pairs = features_by_file.setdefault(curr_filename, [])

            # Warn about lack of features (although that really just means
            # all of them have zero values)
            if not inst_features:
                logger.warning("No features found for %s in %s. All are "
                               "assumed to be zero.",
                               curr_filename, source_name)
                continue

            for feat_name, feat_val in iteritems(inst_features):
                # Handle duplicate features
                feat_name = get_unique_name(feat_name, prev_feature_set,
                                            source_name)

                # Ignore missing and zero-valued features. Only the numeric
                # conversion is guarded so unrelated errors are not
                # mislabelled as bad feature values.
                if feat_val == 'N/A':
                    continue
                try:
                    is_zero = float(feat_val) == 0
                except ValueError:
                    raise ValueError(("Invalid feature value in feature " +
                                      "pair '{} {}' for file {}").format(
                                          feat_name, feat_val, curr_filename))
                if is_zero:
                    continue

                # Convert feature to binary if necessary
                if binarize:
                    if args.doubleup:
                        # Keep the numeric version next to the binary one
                        pairs.append('{} {}'.format(feat_name, feat_val))
                        curr_feature_set.add(feat_name)
                        feat_name = get_unique_name(feat_name + "_binary",
                                                    prev_feature_set,
                                                    source_name)
                    feat_val = 1

                # Add feature pair to this instance's list of features
                pairs.append('{} {}'.format(feat_name, feat_val))
                curr_feature_set.add(feat_name)

        # Add current file's features to set of seen features
        prev_feature_set.update(curr_feature_set)

        # Either intersect or union current file's filenames with existing
        # ones. The first file always seeds the set; testing the file number
        # (not the set's truthiness) keeps an empty intersection empty.
        if args.common and file_num > 1:
            filename_set.intersection_update(curr_filename_set)
        else:
            filename_set.update(curr_filename_set)

    # Print new MegaM file
    for curr_filename, pairs in iteritems(features_by_file):
        # Skip files that aren't common when args.common is true
        if curr_filename not in filename_set:
            continue
        print("# {}".format(curr_filename))
        print("{}\t{}".format(class_dict[curr_filename],
                              ' '.join(pairs).strip()))
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Reads a MegaM file and prints it back to standard output, keeping only
    the features whose names pass the stop word filter. A feature name must
    consist solely of word characters and hyphens to be kept at all.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    """
    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Filter MegaM file to remove\
                     features with names in stop\
                     word list (or non alphabetic\
                     characters). Also has \
                     side-effect of removing TEST,\
                     TRAIN, and DEV lines if they\
                     are present.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("infile", help="MegaM input file", default="-", nargs="?")
    parser.add_argument("stopwordlist", help="Stop word file", type=argparse.FileType("r"))
    parser.add_argument(
        "-i", "--ignorecase", help="Do case insensitive feature name matching.", action="store_true"
    )
    parser.add_argument(
        "-k",
        "--keep",
        help="Instead of removing features with names in the\
              list, keep only those.",
        action="store_true",
    )
    parser.add_argument("--version", action="version", version="%(prog)s {0}".format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=("%(asctime)s - %(name)s - %(levelname)s - " + "%(message)s"))

    # ``infile`` is parsed as a plain string (there is no ``type=``), so it
    # has no ``isatty`` method; "-" is the default and denotes standard input.
    if args.infile == "-" and sys.stdin.isatty():
        print(
            "You are running this script interactively. Press CTRL-D at "
            "the start of a blank line to signal the end of your input. "
            "For help, run it with --help\n",
            file=sys.stderr,
        )

    # Read stop word list
    if args.ignorecase:
        stopwords = {w.strip().lower() for w in args.stopwordlist}
    else:
        stopwords = {w.strip() for w in args.stopwordlist}

    # Compiled once, outside the per-feature loop
    valid_name = re.compile(r"[\w-]*$")

    # Iterate through MegaM file
    for example_id, class_name, feature_dict in _MegaMDictIter(args.infile):
        if example_id is not None:
            print("# {}".format(example_id))
        print(class_name, end="\t")

        kept_pairs = []
        for feature, value in iteritems(feature_dict):
            feature = feature.strip()
            if not valid_name.match(feature):
                continue
            # The stop words were lower-cased above when ignoring case, so
            # the feature name must be lower-cased for the lookup as well.
            lookup = feature.lower() if args.ignorecase else feature
            # Keep listed names with --keep, unlisted names otherwise
            if (lookup in stopwords) == args.keep:
                kept_pairs.append("{} {}".format(feature, value))
        print(" ".join(kept_pairs))
def main(argv=None):
    '''
    Handles command line arguments and gets things started.

    Reads a MegaM file and prints it back to standard output, keeping only
    the features whose names pass the stop word filter. A feature name must
    consist solely of word characters and hyphens to be kept at all.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    '''
    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Filter MegaM file to remove\
                     features with names in stop\
                     word list (or non alphabetic\
                     characters). Also has \
                     side-effect of removing TEST,\
                     TRAIN, and DEV lines if they\
                     are present.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='MegaM input file',
                        default='-',
                        nargs='?')
    parser.add_argument('stopwordlist',
                        help='Stop word file',
                        type=argparse.FileType('r'))
    parser.add_argument('-i', '--ignorecase',
                        help='Do case insensitive feature name matching.',
                        action='store_true')
    parser.add_argument('-k', '--keep',
                        help='Instead of removing features with names in the\
                              list, keep only those.',
                        action='store_true')
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'))

    # ``infile`` is parsed as a plain string (there is no ``type=``), so it
    # has no ``isatty`` method; '-' is the default and denotes standard input.
    if args.infile == '-' and sys.stdin.isatty():
        print("You are running this script interactively. Press CTRL-D at "
              "the start of a blank line to signal the end of your input. "
              "For help, run it with --help\n",
              file=sys.stderr)

    # Read stop word list
    if args.ignorecase:
        stopwords = {w.strip().lower() for w in args.stopwordlist}
    else:
        stopwords = {w.strip() for w in args.stopwordlist}

    # Compiled once, outside the per-feature loop
    valid_name = re.compile(r'[\w-]*$')

    # Iterate through MegaM file
    for example_id, class_name, feature_dict in _MegaMDictIter(args.infile):
        if example_id is not None:
            print("# {}".format(example_id))
        print(class_name, end="\t")

        kept_pairs = []
        for feature, value in iteritems(feature_dict):
            feature = feature.strip()
            if not valid_name.match(feature):
                continue
            # The stop words were lower-cased above when ignoring case, so
            # the feature name must be lower-cased for the lookup as well.
            lookup = feature.lower() if args.ignorecase else feature
            # Keep listed names with --keep, unlisted names otherwise
            if (lookup in stopwords) == args.keep:
                kept_pairs.append('{} {}'.format(feature, value))
        print(" ".join(kept_pairs))