Example #1
def load_sentences(dataset_path):
    """Loads the dataset as a list of sentences.

    Params:
    - dataset_path (str): Path to dataset

    Returns:
    - sentences (pyspark.rdd.RDD): RDD containing sentences
    """
    records = wordcount.load_records(dataset_path)\
        .map(lambda record: Row(record_id=record[0], **record[1].asDict()))
    wordcount.rdd_show(records, "=====Records=====")
    
    # Split each record into its sentences and drop sentences shorter than
    # 20 characters. Then add a unique ID to each sentence and move the ID
    # to the first position so it can be used as the key.
    sentences = records.flatMap(record_to_sentences)\
        .filter(lambda sentence: len(sentence['Sentences_t']) > 19)\
        .zipWithUniqueId()\
        .map(lambda record: (record[1], record[0]))
    wordcount.rdd_show(sentences, "=====Loaded Sentences=====")
    return sentences
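record_to_sentences() used above is defined elsewhere in the project and is not shown. A minimal sketch of what such a helper could look like, assuming the input record carries its text in a 'Sentences_t' field and using a plain period split in place of the real sentence tokenizer:

def record_to_sentences(record):
    # Hypothetical sketch only: the field names and the '. ' split are
    # assumptions, not the project's actual splitting logic.
    text = record['Sentences_t']
    return [{'record_id': record['record_id'], 'Sentences_t': sentence.strip()}
            for sentence in text.split('. ') if sentence.strip()]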
Example #2
    """Converts the dataset RDD to a dataFrame and saves it as multiple JSON files.

    Params:
    - dataset (pyspark.sql.DataFrame): The dataset containing the id, record, preprocessed record, and both feature
                                       sets for each record
    """
    dataset.show()
    dataset.write.json(filename, mode="overwrite")
# End of save_dataset_as_dataframe()
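parse_arguments() used in the __main__ block below is defined elsewhere. A minimal argparse sketch that is merely consistent with the attributes the block reads (args.file, args.sentences, args.output); the flag spellings, defaults, and help strings are assumptions:

import argparse

def parse_arguments():
    # Hypothetical reconstruction: only the attribute names come from the code below.
    parser = argparse.ArgumentParser(description="Build bag-of-words feature sets")
    parser.add_argument("file", help="Path to the input dataset")
    parser.add_argument("--sentences", action="store_true",
                        help="Treat individual sentences as records")
    parser.add_argument("--output", default="dataset_output",
                        help="Directory to write the JSON dataset to")
    return parser.parse_args()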


if __name__ == "__main__":
    args = parse_arguments()
    if args.sentences:
        sentences = load_sentences(args.file)
        with open("bag_of_words_labels.json", "r") as bow_file:
            bag_of_words_labels = json.load(bow_file)
        preprocessed_contents = preprocess_records_keep_fields(sentences)
    else:
        records = wordcount.load_records(args.file, False)
        preprocessed_contents = preprocess_records_keep_fields(records)
        if os.path.isfile("bag_of_words_labels.json"):
            print("Loading bag of words labels from file")
            with open("bag_of_words_labels.json", "r") as bow_file:
                bag_of_words_labels = json.load(bow_file)
        else:
            bag_of_words_labels = get_bag_of_words_labels(preprocessed_contents, args)
    feature_sets = preprocessed_contents.map(lambda contents: make_feature_sets(contents, bag_of_words_labels))
    dataset = feature_sets.toDF()
    save_dataset_as_dataframe(dataset, args.output)
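make_feature_sets() is likewise defined elsewhere; the code above only shows that it takes one preprocessed record plus the bag-of-words labels and that the mapped result converts cleanly to a DataFrame. A hedged sketch of one way that mapping could look, assuming each element of preprocessed_contents is an (id, tokens) pair and that the two feature sets are term counts and binary presence; every field name here is an assumption:

from pyspark.sql import Row

def make_feature_sets(contents, bag_of_words_labels):
    # Hypothetical sketch: count each bag-of-words label in the record's tokens
    # and derive a binary presence vector from those counts.
    record_id, tokens = contents[0], contents[1]
    counts = [tokens.count(label) for label in bag_of_words_labels]
    presence = [1 if count > 0 else 0 for count in counts]
    return Row(id=record_id, frequency_bow=counts, binary_bow=presence)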
Example #3
def pos_tag_verbs(records):
    '''
    POS tags the records and returns the most important verbs.
    Params:
    - records (list<str>): the contents of each record, each stored as a string in a list (i.e. a list of strings)
    Return:
    - important_words (list): a list of important verbs
    '''
    tagged_records = pos_parser(records)
    verbs = verb_tagger(tagged_records)
    tfidf_scores = pos_tfidf_scores(verbs)
    important_words = tfidf.extract_important_words(tfidf_scores, len(verbs))

    return important_words


if __name__ == "__main__":
    args = wordcount.parse_arguments()
    records = wordcount.load_records(args.file)  # RDD of (id, record) pairs; record fields are accessed like a dictionary
    records = records.collect()
    contents = list(map(lambda record: record[1][constants.TEXT],
                        records))  # pull each record's text field into a plain list of strings

    pos_tagged_records = pos_parser(contents)
    nv_tuple = pos_nv_tagger(pos_tagged_records)
    print("MOST IMPORTANT NOUNS:")
    print(pos_tfidf(nv_tuple[0]))
    print("MOST IMPORTANT VERBS:")
    print(pos_tfidf(nv_tuple[1]))
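pos_parser, verb_tagger, pos_nv_tagger, and the tf-idf helpers are imported from the project's own modules and are not shown. A minimal sketch of the verb-extraction step using NLTK, which is an assumption about the underlying tagger rather than the project's actual implementation:

import nltk  # requires the 'punkt' and 'averaged_perceptron_tagger' data packages

def extract_verbs(record_text):
    # Hypothetical sketch: tokenize one record and keep tokens whose
    # Penn Treebank tag starts with 'VB' (any verb form).
    tokens = nltk.word_tokenize(record_text)
    tagged = nltk.pos_tag(tokens)
    return [word for word, tag in tagged if tag.startswith('VB')]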