def compare():
    """Handle requests for /compare via POST.

    Expects two uploaded files ("file1", "file2") and a form field
    "algorithm" ("lines", "sentences", or "substrings"; the last also
    requires a positive integer "length").  Renders compare.html with
    matching spans of both files highlighted.  Aborts with 400 on any
    missing or invalid input.
    """

    # Read files (reject missing or non-UTF-8 uploads)
    if not request.files["file1"] or not request.files["file2"]:
        abort(400, "missing file")
    try:
        file1 = request.files["file1"].read().decode("utf-8")
        file2 = request.files["file2"].read().decode("utf-8")
    except Exception:
        abort(400, "invalid file")

    # Compare files with the requested algorithm
    algorithm = request.form.get("algorithm")
    if not algorithm:
        abort(400, "missing algorithm")
    elif algorithm == "lines":
        # Anchor each match so highlighting only hits whole lines
        regexes = [f"^{re.escape(match)}$" for match in lines(file1, file2)]
    elif algorithm == "sentences":
        regexes = [re.escape(match) for match in sentences(file1, file2)]
    elif algorithm == "substrings":
        length = request.form.get("length")
        if not length:
            abort(400, "missing length")
        # BUG FIX: a non-numeric length used to raise ValueError, producing
        # an HTTP 500; report it as a client error (400) instead.
        try:
            n = int(length)
        except ValueError:
            abort(400, "invalid length")
        if n <= 0:
            abort(400, "invalid length")
        regexes = [re.escape(match) for match in substrings(file1, file2, n)]
    else:
        abort(400, "invalid algorithm")

    # Highlight files
    highlights1 = highlight(file1, regexes)
    highlights2 = highlight(file2, regexes)

    # Output comparison
    return render_template("compare.html", file1=highlights1, file2=highlights2)
def compare():
    """Handle requests for /compare via POST.

    Reads two uploaded UTF-8 files and a form field "algorithm"
    ("lines", "sentences", or "substrings" with a positive integer
    "length"), then renders compare.html with the matches highlighted.
    Any missing or invalid input aborts with HTTP 400.
    """

    # Read files (reject missing or non-UTF-8 uploads)
    if not request.files["file1"] or not request.files["file2"]:
        abort(400, "missing file")
    try:
        file1 = request.files["file1"].read().decode("utf-8")
        file2 = request.files["file2"].read().decode("utf-8")
    except Exception:
        abort(400, "invalid file")

    # Compare files with the requested algorithm
    algorithm = request.form.get("algorithm")
    if not algorithm:
        abort(400, "missing algorithm")
    elif algorithm == "lines":
        # ^...$ anchors restrict highlighting to whole-line matches
        regexes = [f"^{re.escape(match)}$" for match in lines(file1, file2)]
    elif algorithm == "sentences":
        regexes = [re.escape(match) for match in sentences(file1, file2)]
    elif algorithm == "substrings":
        raw_length = request.form.get("length")
        if not raw_length:
            abort(400, "missing length")
        # BUG FIX: int() on a non-numeric length raised ValueError and
        # surfaced as HTTP 500; treat it as a client error (400) instead.
        try:
            length = int(raw_length)
        except ValueError:
            abort(400, "invalid length")
        if length <= 0:
            abort(400, "invalid length")
        regexes = [re.escape(match)
                   for match in substrings(file1, file2, length)]
    else:
        abort(400, "invalid algorithm")

    # Highlight files
    highlights1 = highlight(file1, regexes)
    highlights2 = highlight(file2, regexes)

    # Output comparison
    return render_template("compare.html", file1=highlights1, file2=highlights2)
def rank_sentences(doc, tfidf_dict, include_words=False):
    """Given a document and its tf-idf mapping {word: score}, return a
    list of (sentence, score) pairs, e.g. [(sent1, score1), (sent2, score2)].

    Sentences with fewer than five cleaned words are skipped entirely.
    """

    ranked_sentences = []

    for sentence in helpers.sentences(doc):

        word_list = helpers.clean(sentence)
        # Too few words to score meaningfully — skip the sentence.
        if len(word_list) < 5:
            continue
        # NOTE(review): word[0] assumes helpers.clean yields items whose
        # first element is the dictionary key (e.g. (word, ...) tuples) —
        # confirm against helpers.clean; a plain word string would make
        # word[0] its first character.
        # Idiom fix: generator expression avoids building a throwaway list.
        score = sum(tfidf_dict[word[0]] for word in word_list)

        ranked_sentences.append((sentence, score))

    return ranked_sentences
Example #4
0
def _read_file(path):
    """Return the text contents of path, exiting with a message on failure."""
    try:
        with open(path, "r") as file:
            return file.read()
    except OSError:  # IOError is an alias of OSError in Python 3
        sys.exit(f"Could not read {path}")


def main():
    """Compare two files by lines, sentences, or substrings of length N,
    printing the matches from longest to shortest."""

    # Parse command-line arguments: exactly one comparison mode is required.
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--lines", action="store_true", help="compare lines")
    group.add_argument("--sentences",
                       action="store_true",
                       help="compare sentences")
    group.add_argument("--substrings",
                       metavar="N",
                       type=positive,
                       help="compare substrings of length N")
    parser.add_argument("FILE1", help="file to compare")
    parser.add_argument("FILE2", help="file to compare")
    args = vars(parser.parse_args())

    # Read files (duplicated try/open/except blocks extracted to a helper)
    file1 = _read_file(args["FILE1"])
    file2 = _read_file(args["FILE2"])

    # Compare files.  BUG FIX: the original ended with
    # `elif args["substrings"]:`, leaving `matches` unbound if no branch
    # fired; a plain `else` is safe because the group is required and
    # mutually exclusive.
    if args["lines"]:
        matches = lines(file1, file2)
    elif args["sentences"]:
        matches = sentences(file1, file2)
    else:
        matches = substrings(file1, file2, args["substrings"])

    # Output matches, sorted from longest to shortest, with line endings escaped
    for match in sorted(matches, key=len, reverse=True):
        print(match.replace("\n", "\\n").replace("\r", "\\r"))
Example #5
0
#!/usr/bin/env python3

import sys, re, json, fileinput, glob

from helpers import sentences

# Directory holding the gold UD .conllu files for the reviews treebank.
REVIEWSDIR='UD_English/not-to-release/sources/reviews'

# Path to the .conllulex file to update (first positional argument).
CONLLULEX=sys.argv[1]

# load UD data

# Index every UD sentence by its sent_id: sent_id -> (source file, sentence).
# NOTE(review): assumes sent_ids are unique across files — a duplicate id
# silently overwrites the earlier entry.
ud = {}
udDocs = glob.glob(f'{REVIEWSDIR}/*.xml.conllu')
for udDoc in udDocs:
    for sent in sentences(udDoc):
        ud[sent.meta_dict['sent_id']] = (udDoc, sent)

# Counters for reporting how much of the data changed during the update.
nSentsChanged = nToksChanged = nTagsChanged = nLemmasChanged = nDepsChanged = 0
for sent in sentences(CONLLULEX):
    # metadata shouldn't change (assume tokenization hasn't changed)
    print(*sent.meta, sep='\n')
    newudDoc, newudsent = ud[sent.meta_dict['sent_id']]
    assert len(sent.tokens)==len(newudsent.tokens)
    sentChanged = False
    for tok,newudtok in zip(sent.tokens,newudsent.tokens):
        oldud = '\t'.join(tok.orig.split('\t')[:10])
        newud = '\t'.join(newudtok.orig.split('\t')[:10])
        if oldud!=newud:
            nToksChanged += 1
            sentChanged = True