def main():
    """Strip word-id prefixes from a corpus, keeping only the feature part.

    Reads --input line by line; each space-separated chunk of the form
    "<wid>,<rest>" is reduced to "<rest>".  Chunks without a comma are
    dropped.  One output line is written per input line.
    """
    parser = argparse.ArgumentParser(
        description='Outputs a human readable model.')
    parser.add_argument('--input',
                        '-i',
                        metavar='FILE',
                        help='The input corpus (in Andrews format).')
    parser.add_argument('--output',
                        '-o',
                        metavar='FILE',
                        help='The output corpus (in Andrews format).')

    args = parser.parse_args()

    # 'with' guarantees both files are closed even if processing raises.
    # Iterating the file object streams line by line instead of loading the
    # whole corpus via readlines(); also avoids shadowing the builtin input().
    with open(args.input) as infile:
        with open(args.output, 'w') as outfile:
            for i, line in enumerate(infile):
                line = line.rstrip()
                output_lst = []
                for w in itersplit(line, ' '):
                    if not w.strip():
                        continue
                    if ',' not in w:
                        continue  # plain word, no "<wid>," prefix: drop it
                    output_lst.append(w[w.index(',') + 1:])
                outfile.write(" ".join(output_lst) + "\n")
                logging.info("Document %d: %d words written." %
                             (i, len(output_lst)))
# Example #2
def main():
    """Stochastically replace word counts with (word, feature) counts.

    For every "wid:cnt" chunk in --input whose wid appears in the feature
    map, draws cnt samples from that word's feature distribution and writes
    aggregated "wid,fid:count" chunks; chunks for unknown words pass through
    unchanged.
    """
    parser = argparse.ArgumentParser(
        description='Stochastically adds features to a corpus.')
    parser.add_argument('--vocab',
                        '-v',
                        metavar='FILE',
                        help='The vocab labels.')
    parser.add_argument('--input',
                        '-i',
                        metavar='FILE',
                        help='The input corpus (in Andrews format).')
    parser.add_argument('--output',
                        '-o',
                        metavar='FILE',
                        help='The output corpus (in Andrews format).')
    parser.add_argument('--features',
                        '-f',
                        metavar='FILE',
                        help='The (dense) vector space of features.')

    args = parser.parse_args()
    vocab_labels = load_labels(args.vocab)
    features = load_features(args.features)
    feature_map = word_ids_to_features(vocab_labels, features)

    logging.info("First pass; gathering statistics.")
    # Count lines lazily; readlines() would hold the entire corpus in memory
    # just to take its length.
    inpt = utfopen(args.input)
    try:
        numlines = sum(1 for _ in inpt)
    finally:
        inpt.close()

    logging.info("Starting second pass; actually writing output.")
    output = open(args.output, 'w', 1024 * 1024)
    inpt = utfopen(args.input)
    try:
        # Iterate the file directly (streaming) instead of readlines().
        for lno, line in enumerate(inpt, 1):
            if lno % 1000 == 0:
                logging.info("Processing doc# %d/%d (%4.1f%%)" %
                             (lno, numlines, 100 * float(lno) / numlines))
            for chunk in itersplit(line, ' '):
                chunk = chunk.rstrip()
                if not chunk:
                    continue
                idx = chunk.rindex(":")
                wid, cnt = chunk[:idx], chunk[idx + 1:]
                if wid not in feature_map:
                    # No feature distribution for this word; copy it through.
                    output.write(chunk + ' ')
                else:
                    total = int(cnt)
                    dist = feature_map[wid]
                    # Draw 'total' samples, then emit aggregated counts.
                    # (fcnt avoids shadowing the original chunk count.)
                    cnts = Counter(stochastic_choice(dist)
                                   for _ in xrange(total))
                    for fid, fcnt in cnts.iteritems():
                        output.write('%s,%d:%d ' % (wid, fid, fcnt))
            output.write('\n')
    finally:
        # Close both files even if a malformed chunk raises mid-pass.
        inpt.close()
        output.close()
def main():
    """Densely renumber word ids in a multimodal corpus.

    Word chunks of the form "<wid>,<rest>" get their wid remapped to a
    dense, 1-based id in first-seen order; each new mapping's original
    label is recorded in --outvocab.  Chunks without a comma are dropped.
    """
    parser = argparse.ArgumentParser(description='Stochastically adds features to a corpus.')
    parser.add_argument('--vocab', '-v', metavar='FILE',
                        help='The vocab labels.')
    parser.add_argument('--input', '-i', metavar='FILE',
                        help='The input corpus (in Andrews format). Must be multimodal.')
    parser.add_argument('--output', '-o', metavar='FILE',
                        help='The output corpus (in Andrews format).')
    parser.add_argument('--outvocab', '-V', metavar='FILE',
                        help='The output vocab labels; necessary for OOV processing later.')

    args = parser.parse_args()
    vocab_labels = load_labels(args.vocab)

    logging.info("First pass; gathering statistics.")
    # Count lines lazily instead of materializing the file with readlines().
    inpt = open(args.input)
    try:
        numlines = sum(1 for _ in inpt)
    finally:
        inpt.close()

    output_labels = {}
    output_labels_file = utfopenwrite(args.outvocab)

    logging.info("Starting second pass; actually writing output.")
    output = open(args.output, 'w', 1024*1024)
    inpt = open(args.input)
    try:
        for lno, line in enumerate(inpt, 1):
            if lno % 1000 == 0:
                logging.info("Processing doc# %d/%d (%4.1f%%)" % (lno, numlines, 100*float(lno)/numlines))

            outline = []
            for chunk in itersplit(line, ' '):
                chunk = chunk.rstrip()
                if not chunk: continue
                if ',' not in chunk: continue # strip just words
                idx = chunk.index(',')
                wid = int(chunk[:idx])
                rest = chunk[idx:]

                if wid not in output_labels:
                    # First sighting: assign the next dense (1-based) id and
                    # record the mapping in the output vocab file.
                    output_labels[wid] = len(output_labels) + 1
                    output_labels_file.write("%d\t" % output_labels[wid])
                    output_labels_file.write(vocab_labels[wid])
                    output_labels_file.write("\n")
                outline.append(str(output_labels[wid]) + rest)

            if outline:
                output.write(' '.join(outline))
                output.write('\n')
    finally:
        inpt.close()
        output.close()
        # BUG FIX: the vocab file was never closed, so buffered label lines
        # could be lost on interpreter exit.
        output_labels_file.close()
def main():
    """Add stochastic feature annotations to every known word in a corpus.

    Input chunks look like "wid:cnt"; when wid has a feature distribution,
    its count is redistributed over sampled feature ids and written as
    "wid,fid:cnt" chunks.  Words without a distribution are copied through.
    """
    arg_parser = argparse.ArgumentParser(description='Stochastically adds features to a corpus.')
    arg_parser.add_argument('--vocab', '-v', metavar='FILE',
                            help='The vocab labels.')
    arg_parser.add_argument('--input', '-i', metavar='FILE',
                            help='The input corpus (in Andrews format).')
    arg_parser.add_argument('--output', '-o', metavar='FILE',
                            help='The output corpus (in Andrews format).')
    arg_parser.add_argument('--features', '-f', metavar='FILE',
                            help='The (dense) vector space of features.')

    opts = arg_parser.parse_args()
    vocab_labels = load_labels(opts.vocab)
    features = load_features(opts.features)
    feature_map = word_ids_to_features(vocab_labels, features)

    logging.info("First pass; gathering statistics.")
    reader = utfopen(opts.input)
    numlines = len(reader.readlines())
    reader.close()

    logging.info("Starting second pass; actually writing output.")
    writer = open(opts.output, 'w', 1024*1024)
    reader = utfopen(opts.input)
    for lineno, doc in enumerate(reader.readlines(), 1):
        if lineno % 1000 == 0:
            pct = 100 * float(lineno) / numlines
            logging.info("Processing doc# %d/%d (%4.1f%%)" % (lineno, numlines, pct))
        for piece in itersplit(doc, ' '):
            piece = piece.rstrip()
            if not piece:
                continue
            colon = piece.rindex(":")
            wid, cnt = piece[:colon], piece[colon + 1:]
            if wid in feature_map:
                cnt = int(cnt)
                dist = feature_map[wid]
                # Sample cnt feature ids, then write the aggregated tallies.
                cnts = Counter(stochastic_choice(dist) for i in xrange(cnt))
                for fid, cnt in cnts.iteritems():
                    writer.write('%s,%d:%d ' % (wid, fid, cnt))
            else:
                writer.write(piece + ' ')
        writer.write('\n')

    reader.close()
    writer.close()
def main():
    """Emit only the post-comma portion of each annotated corpus token."""
    parser = argparse.ArgumentParser(description='Outputs a human readable model.')
    parser.add_argument('--input', '-i', metavar='FILE',
                        help='The input corpus (in Andrews format).')
    parser.add_argument('--output', '-o', metavar='FILE',
                        help='The output corpus (in Andrews format).')

    args = parser.parse_args()

    src = open(args.input)
    dst = open(args.output, 'w')
    for docno, raw in enumerate(src.readlines()):
        # Keep only tokens carrying a "<wid>," prefix, stripped of it.
        kept = [tok[tok.index(',') + 1:]
                for tok in itersplit(raw.rstrip(), ' ')
                if tok.strip() and ',' in tok]
        dst.write(" ".join(kept) + "\n")
        logging.info("Document %d: %d words written." % (docno, len(kept)))

    src.close()
    dst.close()
def main():
    """Densely renumber word ids in a multimodal corpus.

    Word chunks of the form "<wid>,<rest>" get their wid remapped to a
    dense, 1-based id in first-seen order; each new mapping's original
    label is recorded in --outvocab.  Chunks without a comma are dropped.
    """
    parser = argparse.ArgumentParser(
        description='Stochastically adds features to a corpus.')
    parser.add_argument('--vocab',
                        '-v',
                        metavar='FILE',
                        help='The vocab labels.')
    parser.add_argument(
        '--input',
        '-i',
        metavar='FILE',
        help='The input corpus (in Andrews format). Must be multimodal.')
    parser.add_argument('--output',
                        '-o',
                        metavar='FILE',
                        help='The output corpus (in Andrews format).')
    parser.add_argument(
        '--outvocab',
        '-V',
        metavar='FILE',
        help='The output vocab labels; necessary for OOV processing later.')

    args = parser.parse_args()
    vocab_labels = load_labels(args.vocab)

    logging.info("First pass; gathering statistics.")
    # Count lines lazily instead of materializing the file with readlines().
    inpt = open(args.input)
    try:
        numlines = sum(1 for _ in inpt)
    finally:
        inpt.close()

    output_labels = {}
    output_labels_file = utfopenwrite(args.outvocab)

    logging.info("Starting second pass; actually writing output.")
    output = open(args.output, 'w', 1024 * 1024)
    inpt = open(args.input)
    try:
        for lno, line in enumerate(inpt, 1):
            if lno % 1000 == 0:
                logging.info("Processing doc# %d/%d (%4.1f%%)" %
                             (lno, numlines, 100 * float(lno) / numlines))

            outline = []
            for chunk in itersplit(line, ' '):
                chunk = chunk.rstrip()
                if not chunk:
                    continue
                if ',' not in chunk:
                    continue  # strip just words
                idx = chunk.index(',')
                wid = int(chunk[:idx])
                rest = chunk[idx:]

                if wid not in output_labels:
                    # First sighting: assign the next dense (1-based) id and
                    # record the mapping in the output vocab file.
                    output_labels[wid] = len(output_labels) + 1
                    output_labels_file.write("%d\t" % output_labels[wid])
                    output_labels_file.write(vocab_labels[wid])
                    output_labels_file.write("\n")
                outline.append(str(output_labels[wid]) + rest)

            if outline:
                output.write(' '.join(outline))
                output.write('\n')
    finally:
        inpt.close()
        output.close()
        # BUG FIX: the vocab file was never closed, so buffered label lines
        # could be lost on interpreter exit.
        output_labels_file.close()