def main(sts_train_file, sts_dev_file):
    """Fits a logistic regression for paraphrase identification, using string similarity metrics as features.
    Prints accuracy on held-out data. Data is formatted as in the STS benchmark"""

    # Thresholds for binarizing STS similarity scores into paraphrase labels:
    # a pair scoring <= max_nonparaphrase is a negative, >= min_paraphrase a positive.
    max_nonparaphrase = 3.0
    min_paraphrase = 4.0

    # Training partition: sentence pairs plus their gold similarity scores.
    train_pairs, train_scores = parse_sts(sts_train_file)

    # Held-out partition used for reporting accuracy.
    dev_pairs, dev_scores = parse_sts(sts_dev_file)
# Esempio n. 2 (Example no. 2 — separator left over from the scraped source)
# 0
def main(sts_train_file, sts_dev_file, w2v_file):
    """Fits a logistic regression for paraphrase identification, using string similarity metrics and vector similarity
    as features. Prints results on held-out data. Data is formatted as in the STS benchmark"""

    # Cutoffs that turn graded STS scores into binary paraphrase labels:
    # >= min_paraphrase is a paraphrase, <= max_nonparaphrase is not.
    min_paraphrase = 4.0
    max_nonparaphrase = 3.0

    # TODO 1: Load data partitions and convert to paraphrase dataset as in the lab
    # You will train a logistic regression on the TRAIN partition
    train_pairs, train_scores = parse_sts(sts_train_file)

    # You will evaluate predictions on the VALIDATION partition
    dev_pairs, dev_scores = parse_sts(sts_dev_file)
def main(sts_data):
    """Calculate NIST metric for pairs of strings.

    Data is formatted as in the STS benchmark. Prints each sampled sentence
    pair with its gold similarity label and a (placeholder) NIST score, and
    collects the scores in a list.
    """

    # TODO 1: define a function to read the data in util
    texts, labels = parse_sts(sts_data)

    print(f"Found {len(texts)} STS pairs")

    # take a sample of sentences so the code runs fast for faster debugging
    # when you're done debugging, you may want to run this on more!
    sample_text = texts[120:140]
    sample_labels = labels[120:140]
    # zip them together to make (gold_label, (sentence1, sentence2)) tuples
    sample_data = zip(sample_labels, sample_text)

    scores = []
    # BUG FIX: the original wrapped sample_data in enumerate(), which made
    # `label` the enumeration index and `text` the whole (label, pair) tuple,
    # so the gold label was lost and the unpacking below was wrong.
    for label, text in sample_data:
        t1, t2 = text
        print(f"Sentences: {t1}\t{t2}")

        # TODO 2: Calculate NIST for each pair of sentences
        # calculate NIST(a,b) and NIST(b,a) and
        # catch any exceptions and assign 0.0 for that part of the score
        nist_score = 0.0

        print(f"Label: {label}, NIST: {nist_score:0.02f}\n")
        # BUG FIX: the original appended the undefined name `score` (NameError);
        # the computed value is held in `nist_score`.
        scores.append(nist_score)
def main(sts_data):
    """Skeleton: compare TF-IDF cosine similarity against STS labels,
    with and without text preprocessing."""

    texts, labels = parse_sts(sts_data)

    # TODO 1: get a single list of texts to determine vocabulary and document frequency
    # create a TfidfVectorizer
    # fit to the training data

    # TODO 2: Can normalization like removing stopwords remove differences that aren't meaningful?
    # fill in preprocess_text above
    preproc_train_texts = list(map(preprocess_text, texts))

    # TODO 3: Learn another TfidfVectorizer for preprocessed data
    # Use token_pattern "\S+" in the TfidfVectorizer to split on spaces

    # TODO 4: compute cosine similarity for each pair of sentences, both with and without preprocessing
    cos_sims = []
    cos_sims_preproc = []

    for sentence_pair in texts:
        first_sent, second_sent = sentence_pair

    # TODO 5: measure the correlations
    pearson = 0.0
    preproc_pearson = 0.0
    print(f"default settings: r={pearson:.03}")
    print(f"preprocessed text: r={preproc_pearson:.03}")
# Esempio n. 5 (Example no. 5 — separator left over from the scraped source)
# 0
def main(sts_dev, w2v_file):
    """Skeleton: correlate word2vec sentence-similarity scores with STS labels.

    Loads the dev partition, is meant to load word2vec vectors, compute cosine
    similarities per pair (mean- and product-composed sentence vectors), and
    print Pearson correlations.
    """

    # load the texts: (sentence1, sentence2) pairs and their gold scores
    dev_texts, dev_y = parse_sts(sts_dev)

    # load word2vec using gensim KeyedVectors object
    w2v_vecs = None

    # get cosine similarities of every pair in dev
    # if either sentence is completely out of vocabulary, record "0" as the similarity
    cos_sims_mean = []
    cos_sims_product = []

    # BUG FIX: the placeholders were bare ints (0), but the prints below index
    # [0] — pearsonr returns an (r, p-value) tuple — so running the unmodified
    # skeleton raised TypeError. Use 2-tuples so the prints work as-is.
    pearson_mean = (0.0, 0.0)
    print(f"word2vec mean pearsons: r={pearson_mean[0]:.03}")

    pearson_prod = (0.0, 0.0)
    print(f"word2vec product pearsons: r={pearson_prod[0]:.03}")
# Esempio n. 6 (Example no. 6 — separator left over from the scraped source)
# 0
def main(sts_data):
    """Calculate NIST metric for pairs of strings
    Data is formatted as in the STS benchmark"""

    # read the dataset: sentence pairs and their gold similarity labels
    texts, labels = parse_sts(sts_data)

    print(f"Found {len(texts)} STS pairs")

    # iterate a small sample (pairs 120..139) paired with their labels;
    # parallel slices replace the original enumerate-plus-offset indexing
    for label, (t1, t2) in zip(labels[120:140], texts[120:140]):
        print(f"Sentences: {t1}\t{t2}")

        # TODO: Calculate for each pair of sentences
        # catch any exceptions and assign 0.0

        nist_score = 0.0
        print(f"Label: {label}, NIST: {nist_score:0.02f}\n")
# Esempio n. 7 (Example no. 7 — separator left over from the scraped source)
# 0
def main(sts_data):
    """Transform a semantic textual similarity dataset into a paraphrase identification.
    Data is formatted as in the STS benchmark"""

    # STS score cutoffs for the binary paraphrase labels
    min_paraphrase = 4.0
    max_nonparaphrase = 3.0

    # read the dataset
    texts, labels = parse_sts(sts_data)
    labels = np.asarray(labels)

    # convert graded similarity into a paraphrase-identification dataset
    pi_texts, pi_labels = sts_to_pi(texts, labels)

    # calculate to check your split agrees with mine
    num_nonparaphrase = 0
    num_paraphrase = 0
    # 957 for dev
    print(f"{num_nonparaphrase} non-paraphrase")
    # 264 for dev
    print(f"{num_paraphrase} paraphrase")

    # Instantiate a TFIDFVectorizer to create representations for sentences
    # compute cosine similarity for each pair of sentences
    # use a threshold of 0.7 to convert each similarity score into a paraphrase prediction
    cos_sims_preproc = []

    threshold = 0.7
    predictions = np.asarray(cos_sims_preproc) > threshold

    # calculate and print precision and recall statistics for your system
    num_pred = 0
    print(f"Number predicted paraphrase: {num_pred}")

    num_pos = 0
    print(f"Number positive: {num_pos}")

    num_true_pos = 0
    print(f"Number true positive: {num_true_pos}")

    precision = 0
    recall = 0
    print(f"Scores: precision {precision:0.03}\trecall {recall:0.03}")
def main(sts_data):
    """Calculate pearson correlation between semantic similarity scores and string similarity metrics.
    Data is formatted as in the STS benchmark"""

    # TODO 1: read the dataset; implement in util.py
    texts, labels = parse_sts(sts_data)

    print(f"Found {len(texts)} STS pairs")

    # TODO 2: Calculate the metrics here
    score_types = ["NIST", "BLEU", "Word Error Rate", "Longest common substring", "Edit Distance"]

    # Sample code to print results. You can alter the printing as you see fit. It is most important to put the results
    # in a table in the README
    print(f"Semantic textual similarity for {sts_data}\n")
    for metric_name in score_types:
        score = 0.0  # placeholder until the metric correlations are computed
        print(f"{metric_name} correlation: {score:.03f}")
def main(sts_data):
    """Calculate pearson correlation between semantic similarity scores and string similarity metrics.
    Data is formatted as in the STS benchmark"""

    # read the dataset
    # TODO: implement in util.py
    texts, labels = parse_sts(sts_data)

    print(f"Found {len(texts)} STS pairs")

    score_types = ["NIST", "BLEU", "Word Error Rate", "Longest common substring", "Levenshtein distance"]
    scores = {score_type: [] for score_type in score_types}

    # TODO: Calculate the metrics here to fill the lists in scores

    # This can stay as-is to print similar output to the sample
    print(f"Semantic textual similarity for {sts_data}\n")
    for metric_name, dists in scores.items():
        # BUG FIX: the original unconditionally called pearsonr on the empty
        # placeholder lists, which crashes before the TODO is filled in.
        # Guard so the skeleton runs; behavior is unchanged once dists is filled.
        if dists:
            score, sig = pearsonr(dists, labels)
        else:
            score = float("nan")  # metric not implemented yet
        print(f"{metric_name} correlation: {score:.03}")
def main(sts_dev, w2v_file):
    """Skeleton: correlate word2vec-based sentence similarities with STS labels.

    Loads the dev partition, is meant to load word2vec vectors, build sentence
    representations (mean and element-wise product of word vectors), score
    every pair by cosine similarity, and print Pearson correlations.
    """

    # TODO 1: load the texts
    dev_texts, dev_y = parse_sts(sts_dev)

    # TODO 2: load word2vec using gensim KeyedVectors object
    # WARNING: you may need to downgrade gensim to version 3.4
    w2v_vecs = None

    # TODO 3: Define the functions above that compose word representations into sentence representations

    # TODO 4: get cosine similarities of every sentence pair in dev
    # if either sentence is completely out of vocabulary, record "0" as the similarity for the pair
    cos_sims_mean = []
    cos_sims_product = []

    # TODO 5: Measure correlation with STS labels for the two ways of computing word2vec sentence representations
    # BUG FIX: the placeholders were bare ints (0), but the prints below index
    # [0] — pearsonr returns an (r, p-value) tuple — so running the unmodified
    # skeleton raised TypeError. Use 2-tuples so the prints work as-is.
    pearson_mean = (0.0, 0.0)
    print(f"word2vec mean pearsons: r={pearson_mean[0]:.03}")

    pearson_prod = (0.0, 0.0)
    print(f"word2vec product pearsons: r={pearson_prod[0]:.03}")