Example #1
0
def listening():
    # Incoming JSON payload (this reads like a Flask/webhook handler).
    data = request.json
    # Ask the assistant client about the question and act on the response
    # chosen for the top-ranked intent.
    resp = client.message(data['question'])
    tools.read_text(tools.choose_response(resp["intents"][0]["name"]))
    return {
        "status": "END",
        "request": data['question'],
        "response": resp
    }
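A plausible wiring for the handler above, assuming it runs inside a Flask app; client and tools are project-specific objects, so only the route registration is sketched:

from flask import Flask, request

app = Flask(__name__)
# The "/listening" path and POST method are illustrative guesses; the
# example itself does not say how the handler is registered.
app.add_url_rule("/listening", view_func=listening, methods=["POST"])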
Example #2
0
def build_forms_histogram(filename, forms2basic, hist_size=0):
    # Map each word in the file to its basic form, deduplicate, and build
    # a Laplace-smoothed histogram over the resulting forms.
    text = tools.read_text(filename, clean_txt=False)
    words = tools.find_words(text)
    words = [code_digraphs(word_i) for word_i in words]
    forms = [forms2basic[word_i] for word_i in words if word_i in forms2basic]
    forms = tools.unique_list(forms)
    return build_histogram(forms, laplace_smoothing=True, size=hist_size)
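build_histogram itself never appears in these examples; a minimal sketch of a Laplace-smoothed histogram builder, assuming size only sets a floor on the vocabulary used in the smoothing denominator (the real implementation may differ):

from collections import Counter

def build_histogram(items, laplace_smoothing=False, size=0):
    # Hypothetical stand-in for the build_histogram called above.
    counts = Counter(items)
    alpha = 1 if laplace_smoothing else 0
    denom = (sum(counts.values()) + alpha * max(len(counts), size)) or 1
    return {item: (counts[item] + alpha) / denom for item in sorted(counts)}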
Example #3
0
def build_forms_histogram(filename, forms2basic, hist_size=0):
    text = tools.read_text(filename, clean_txt=False)
    words = tools.find_words(text)
    words = [code_digraphs(word_i) for word_i in words]
    forms = [forms2basic[word_i] for word_i in words if word_i in forms2basic]
    forms = tools.unique_list(forms)
    return build_histogram(forms, laplace_smoothing=True, size=hist_size)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", type=str, help="The JSON file to load data from.")
    parser.add_argument("-a", "--adjectives", type=str, help="The list of adjectives to count.")
    parser.add_argument("-o", "--output", type=str,
                        help="The directory to output data files to.", default=getcwd())

    arg = parser.parse_args()

    # Basic sanity checking.
    if arg.input is None:
        print("You need to specify a JSON file to load.")
        return 1
    if arg.adjectives is None:
        print("You need to specify where to find the adjective list.")
        return 1
    if not os.path.isfile(arg.input):
        print("Your input file does not exist.")
        return 1
    if not os.path.isfile(arg.adjectives):
        print("Your adjective list does not exist.")
        return 1
    if not os.path.isdir(arg.output):
        print("Your output path does not exist.")
        return 1

    data = tools.read_JSON(arg.input)
    adjectives = tools.read_text(arg.adjectives)

    sentence_list = []

    # The count function expects a plain list of sentences, with the
    # surrounding file data stripped away.
    for item in data:
        if item.sentence.has_pair and item.sentence.speaker.age.decimal < 8:
            sentence_list.append(item.sentence)

    counts = tools.count_from_list(sentence_list, adjectives)

    # Now let's generate the CSV file.
    csv_header = "adjective, prenominal count, postnominal count"
    csv_data = tools.gen_standard_count_CSV(csv_header, counts)
    tools.write_CSV(csv_data, os.path.join(arg.output, 'children.csv'))
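tools.count_from_list is not shown either; a rough sketch of what it plausibly does, tallying prenominal and postnominal uses of each adjective. The prenominal_adjectives/postnominal_adjectives accessors are illustrative guesses, not the real sentence API:

def count_from_list(sentences, adjectives):
    # Hypothetical sketch: one (adjective, prenominal, postnominal) row per
    # adjective in the supplied list.
    counts = {adj: [0, 0] for adj in adjectives}
    for sentence in sentences:
        for adj in sentence.prenominal_adjectives():  # assumed accessor
            if adj in counts:
                counts[adj][0] += 1
        for adj in sentence.postnominal_adjectives():  # assumed accessor
            if adj in counts:
                counts[adj][1] += 1
    return [(adj, pre, post) for adj, (pre, post) in counts.items()]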
Example #5
0
    def _extract_label_from_json(self, json_file: Path) -> str:
        json_text = read_text(json_file, encoding='utf8')
        label_names = ("ORT", "word")
        try:
            j = json.loads(json_text)
            levels = j["levels"]

            def is_level_empty(level: dict) -> bool:
                return len(level["items"]) == 0

            def is_level_useful(level: dict) -> bool:
                if is_level_empty(level):
                    return False

                return any(label["name"] in label_names
                           for label in level["items"][0]["labels"])

            def word(transcription: dict) -> str:
                labels = transcription["labels"]

                matching_labels = [label for label in labels if label["name"] in label_names]

                if len(matching_labels) == 0:
                    raise Exception("No matching label names, found {} instead.".format(
                        [label["name"] for label in labels]))

                matching_label = single(matching_labels)
                return matching_label["value"]

            has_empty_levels = any(is_level_empty(level) for level in levels)

            words = single_or_none([[word(transcription) for transcription in level["items"]]
                                    for level in levels if is_level_useful(level)])

            if words is None and has_empty_levels:
                return ""

            return self._decode_german(" ".join(words))
        except Exception as e:
            raise ParsingException("Error parsing annotation {}: {}".format(
                json_file, json_text[:500])) from e
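single and single_or_none are helpers this example assumes but never defines; a minimal sketch of their usual semantics:

def single(items):
    # Exactly one element, or an error.
    if len(items) != 1:
        raise ValueError("Expected exactly one item, got {}.".format(len(items)))
    return items[0]

def single_or_none(items):
    # One element, or None when the list is empty.
    if len(items) == 0:
        return None
    return single(items)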
Example #6
0
def build_word_histogram(filename, forms):
    text = tools.read_text(filename, clean_txt=False)
    words = tools.find_words(text)
    return build_histogram(words, laplace_smoothing=False)
Example #7
0
def build_word_histogram(filename, forms):
    text = tools.read_text(filename, clean_txt=False)
    words = tools.find_words(text)
    return build_histogram(words, laplace_smoothing=False)
Example #8
0
def read_clusters(filename):
    txt = tools.read_text(filename, clean_txt=False)
    raw_clusters = txt.split("##########")
    clusters = [parse_clusters(raw_i) for raw_i in raw_clusters]
    return clusters
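parse_clusters is assumed but not shown; a minimal sketch under the guess that each "##########"-delimited block lists one cluster member per line (the real format may carry more structure):

def parse_clusters(raw_cluster):
    # Treat the block as one member per line, skipping blank lines.
    return [line.strip() for line in raw_cluster.splitlines() if line.strip()]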
Example #9
0
    def _extract_label_from_par(self, par_file: Path) -> str:
        par_text = read_text(par_file, encoding='utf8')

        # Keep only the ORT (orthography) lines, dropping the 7-character
        # field prefix from each.
        ort_words = [line[7:] for line in par_text.splitlines() if line.startswith("ORT")]
        return self._decode_german(" ".join(ort_words))
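A quick illustration of the core transform on a made-up two-line fragment; the 7-character "ORT: n " prefix width is an assumption about the PAR format:

par_text = "ORT: 0 guten\nORT: 1 Morgen\nKAN: 0 ..."
words = [line[7:] for line in par_text.splitlines() if line.startswith("ORT")]
print(" ".join(words))  # -> guten Morgen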
Example #10
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-c",
                        "--child",
                        type=str,
                        help="The JSON file to load child data from.")
    parser.add_argument("-a",
                        "--adult",
                        type=str,
                        help="Directory of where to find adult data.")
    parser.add_argument("-w",
                        "--words",
                        type=str,
                        help="The list of adjectives to count.")
    parser.add_argument("-o",
                        "--output",
                        type=str,
                        help="The directory to output CSV files to.",
                        default=getcwd())

    arg = parser.parse_args()

    # Basic sanity checking.
    if arg.child is None:
        print("You need to specify a JSON file to load.")
        return 1
    if arg.adult is None:
        print(
            "You need to specify where the adult data should get loaded from.")
        return 1
    if arg.words is None:
        print("You need to specify where to find the adjective list.")
        return 1
    if not os.path.isfile(arg.child):
        print("Your child data file does not exist.")
        return 1
    if not os.path.isdir(arg.adult):
        print("Your adult data directory does not exist.")
        return 1
    if not os.path.isdir(arg.output):
        print("Your output path for the CSV file does not exist.")
        return 1
    if not os.path.isfile(arg.words):
        print("Your adjective list does not exist.")
        return 1

    # Load the child data.
    data = tools.read_JSON(arg.child)
    adjectives = tools.read_text(arg.words)
    child_sentence_list = []
    for item in data:
        if item.sentence.speaker.age.decimal < 8:
            child_sentence_list.append(item.sentence)

    # Load the adult data.
    files = tools.find_orfeo_files(arg.adult)
    adult_sentence_list = []

    # Go through the files and pair each orfeo file with its speaker metadata.
    for file in files:
        orfeo_file = file
        xml_file = file[0:-6] + ".xml"  # swap the ".orfeo" extension for ".xml"
        speaker_data = tools.read_speaker(xml_file)
        sentence_data = tools.read_sentences(orfeo_file, speaker_data)
        adult_sentence_list.extend(sentence_data)

    # Get word totals.
    child_words = count_words(child_sentence_list, True)
    adult_words = count_words(adult_sentence_list)
    print("The total words in the Child Data is:  " + str(child_words))
    print("The total words in the Adult Data is:  " + str(adult_words))

    # Age bins in 6-month increments.
    print("Counting adjectives in child data for each bin.")
    for age_low in range(19, 48, 6):
        age_high = age_low + 6

        child_adjective_count, child_male_count, child_female_count, child_total_count =\
            adj_child_age_counts(child_sentence_list, age_low, age_high, adjectives)
        print("From age " + str(age_low) + " to " + str(age_high) + ":  " +
              str(child_adjective_count) + " total adjectives (" +
              str(child_male_count) + " male, " + str(child_female_count) +
              " female) out of " + str(child_total_count) + " words.")

    print("Counting gendered adjectives in child data.")
    child_male_count, child_female_count = get_gendered_counts(
        child_sentence_list, adjectives)
    print("Male children uttered " + str(child_male_count) +
          " adjectives and female children uttered " +
          str(child_female_count) + " adjectives.")

    print("Counting adjectives in the adult data.")
    adult_adjective_count = adj_adult_age_counts(adult_sentence_list,
                                                 adjectives)
    print("The adult data has:  " + str(adult_adjective_count) + ".")
Example #11
0
def read_clusters(filename):
    txt = tools.read_text(filename, clean_txt=False)
    raw_clusters = txt.split("##########")
    clusters = [parse_clusters(raw_i) for raw_i in raw_clusters]
    return clusters
def main():
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("-i",
                        "--input",
                        type=str,
                        help="Name of the JSON file to load.")
    parser.add_argument(
        "-v",
        "--verified",
        help="Tells the program that the file has been human-verified.",
        action='store_true')
    parser.add_argument("-w",
                        "--whitelist",
                        type=str,
                        help="Text list of known adjectives.")
    parser.add_argument(
        "-b",
        "--blacklist",
        type=str,
        help="Text list of known erroneously tagged adjectives.")
    parser.add_argument("-o",
                        "--output",
                        type=str,
                        help="The directory to output data files to.",
                        default=getcwd())
    parser.add_argument("-t",
                        "--test",
                        help="Test mode, output goes to console.",
                        action='store_true')
    parser.add_argument(
        "-c",
        "--count",
        help="Simply counts the number of sentences in a file.",
        action='store_true')
    parser.add_argument("-l",
                        "--lem",
                        help="Lemmatize the data to extract root words.",
                        action='store_true')
    parser.add_argument("-a",
                        "--age",
                        help="Generates age-specific lists of adjectives.",
                        action='store_true')
    parser.add_argument(
        "-r",
        "--colors",
        help="Processes all the colors and positions for each age group.",
        action='store_true')
    parser.add_argument(
        "-n",
        "--nouns",
        help="Counts noun/adjective occurrences fro each age group.",
        action='store_true')
    parser.add_argument(
        "-p",
        "--repair",
        help="Reprocesses the input file to regroup adjective/noun pairs.",
        action='store_true')

    arg = parser.parse_args()

    # Validate that the user gave the program something to do.
    if arg.input is None:
        print("You must specify a JSON file to load, use '-h' for help.")
        return 1
    if not os.path.isfile(arg.input):
        print("File " + arg.input + " not found or is not a file.")
        return 1
    if not arg.count:
        if os.path.isfile(arg.output):
            print("File " + arg.output +
                  " needs to be a directory to output data to.")
            return 1

    # Load JSON file.
    sentences = tools.read_JSON(arg.input)

    if arg.count:
        print("There are " + str(len(sentences)) + " sentences in this file.")
        return 0

    # Lemmatize the data and generate master adjective lists for all ages.
    if arg.lem:
        for s in sentences:
            s.sentence.lem()

        tools.save_JSON(sentences, arg.output + '/lem-data.json')
        return 0

    # Generate the adjective lists, split at age 8.
    if arg.age:
        all_lemma = []
        older_lemma = []
        younger_lemma = []
        older_pre_lemma = []
        older_post_lemma = []
        younger_pre_lemma = []
        younger_post_lemma = []

        for s in sentences:
            s.sentence.lem()
            # 'not_needed' is a placeholder: find_adjectives returns both the
            # inflected adjectives and their lemmas, and only the lemmas
            # matter here, so the inflected forms are discarded.
            if s.sentence.speaker.age.decimal >= 8:
                not_needed, temp_lemma = s.sentence.find_adjectives()
                all_lemma.extend(temp_lemma)
                older_lemma.extend(temp_lemma)
                temp_pre, temp_post = s.sentence.get_pre_post_lists()
                older_pre_lemma.extend(temp_pre)
                older_post_lemma.extend(temp_post)
            else:
                not_needed, temp_lemma = s.sentence.find_adjectives()
                all_lemma.extend(temp_lemma)
                younger_lemma.extend(temp_lemma)
                temp_pre, temp_post = s.sentence.get_pre_post_lists()
                younger_pre_lemma.extend(temp_pre)
                younger_post_lemma.extend(temp_post)

        # Now we need to count everything up.
        counts = tools.count_adj(all_lemma, older_lemma, younger_lemma,
                                 older_pre_lemma, older_post_lemma,
                                 younger_pre_lemma, younger_post_lemma)

        # Now let's output the data.
        header = "Lemma, Full Count, Older, Younger, Older Prenominal, Older Postnominal, Younger Prenominal, Younger Postnominal"
        counts_csv = tools.gen_stat_CSV(header, counts)
        counts_file = arg.output + "/counts.csv"
        tools.write_CSV(counts_csv, counts_file)

        return 0

    # Generate color adjectives only in each position.
    if arg.colors:
        all_colors = []
        older_colors = []
        younger_colors = []
        older_pre_colors = []
        older_post_colors = []
        younger_pre_colors = []
        younger_post_colors = []

        for s in sentences:
            s.sentence.lem()
            # get_colors returns the color adjectives split into prenominal
            # and postnominal lists, so no placeholder is needed here.
            if s.sentence.speaker.age.decimal >= 8:
                temp_older_pre_colors, temp_older_post_colors = \
                    s.sentence.get_colors()
                older_pre_colors.extend(temp_older_pre_colors)
                older_post_colors.extend(temp_older_post_colors)
            else:
                temp_younger_pre_colors, temp_younger_post_colors = \
                    s.sentence.get_colors()
                younger_pre_colors.extend(temp_younger_pre_colors)
                younger_post_colors.extend(temp_younger_post_colors)

        # Gather every color adjective into one list for counting.
        all_colors.extend(older_pre_colors)
        all_colors.extend(older_post_colors)
        all_colors.extend(younger_pre_colors)
        all_colors.extend(younger_post_colors)

        # For completeness produce a list of all colors used in each age group.
        older_colors.extend(older_pre_colors)
        older_colors.extend(older_post_colors)
        younger_colors.extend(younger_pre_colors)
        younger_colors.extend(younger_post_colors)

        # Now we need to count everything up.
        counts = tools.count_adj(all_colors, older_colors, younger_colors,
                                 older_pre_colors, older_post_colors,
                                 younger_pre_colors, younger_post_colors)

        # Now let's output the data.
        header = "Lemma, Full Count, Older, Younger, Older Prenominal, Older Postnominal, Younger Prenominal, Older Postnominal"
        counts_csv = tools.gen_stat_CSV(header, counts)
        counts_file = arg.output + "/colors.csv"
        tools.write_CSV(counts_csv, counts_file)

        return 0

    # Reprocess the input file to generate new adjective/noun groups.
    if arg.repair:
        non_words_list = tools.read_text(arg.output + "/non-words.txt")
        for s in sentences:
            s.sentence.filter_all(non_words_list)

        tools.save_JSON(sentences, arg.output + "/repaired-data.json")

        return 0

    # Generate the counts of each adjective and noun combinations.
    if arg.nouns:
        # A place for all the adjectives to check.
        all_lemma = []
        older_lemma = []
        younger_lemma = []
        older_pre_lemma = []
        older_post_lemma = []
        younger_pre_lemma = []
        younger_post_lemma = []

        # A place for all the nouns to check.
        all_noun = []

        # Get a complete list of all nouns and adjectives.
        for s in sentences:
            s.sentence.lem()
            # 'not_needed' is a placeholder: find_adjectives returns both the
            # inflected adjectives and their lemmas, and only the lemmas
            # matter here, so the inflected forms are discarded.
            not_needed, temp_lemma = s.sentence.find_adjectives()
            temp_pre_nouns, temp_post_nouns = s.sentence.get_nouns()
            all_lemma.extend(temp_lemma)
            temp_pre, temp_post = s.sentence.get_pre_post_lists()
            all_noun.extend(temp_pre_nouns)
            all_noun.extend(temp_post_nouns)

            if s.sentence.speaker.age.decimal >= 8:
                older_lemma.extend(temp_lemma)
                older_pre_lemma.extend(temp_pre)
                older_post_lemma.extend(temp_post)
            else:
                younger_lemma.extend(temp_lemma)
                younger_pre_lemma.extend(temp_pre)
                younger_post_lemma.extend(temp_post)

        # Get the adjective counts
        counts = tools.count_adj(all_lemma, older_lemma, younger_lemma,
                                 older_pre_lemma, older_post_lemma,
                                 younger_pre_lemma, younger_post_lemma)

        reduced_lemma = list(set(all_lemma))
        reduced_older = list(set(older_lemma))
        reduced_younger = list(set(younger_lemma))
        reduced_older_pre = list(set(older_pre_lemma))
        reduced_older_post = list(set(older_post_lemma))
        reduced_younger_pre = list(set(younger_pre_lemma))
        reduced_younger_post = list(set(younger_post_lemma))

        # Now we need to remove from the lists all adjectives that occur
        # fewer than 20 times.
        for c in counts:
            if c[1] < 20:
                for reduced in (reduced_lemma, reduced_older, reduced_younger,
                                reduced_older_pre, reduced_older_post,
                                reduced_younger_pre, reduced_younger_post):
                    if c[0] in reduced:
                        reduced.remove(c[0])

        # Generate a reduced noun set.
        canon_nouns = list(set(all_noun))
        reduced_nouns = canon_nouns[:]
        noun_counts = []
        for n in canon_nouns:
            noun_counts.append((n, all_noun.count(n)))

        for nc in noun_counts:
            if nc[1] <= 4:
                reduced_nouns.remove(nc[0])

        # Count everything.
        matrix, older_matrix, younger_matrix, older_pre_matrix, older_post_matrix, younger_pre_matrix, younger_post_matrix = \
            tools.count_noun_adj(sentences,
                           reduced_lemma,
                           canon_nouns,
                           reduced_older,
                           reduced_younger,
                           reduced_older_pre,
                           reduced_older_post,
                           reduced_younger_pre,
                           reduced_younger_post)

        # Generate the data.
        all_data = tools.noun_adj_matrix_gen_csv(matrix, reduced_lemma,
                                                 reduced_nouns)
        older_data = tools.noun_adj_matrix_gen_csv(older_matrix, reduced_lemma,
                                                   reduced_nouns)
        younger_data = tools.noun_adj_matrix_gen_csv(younger_matrix,
                                                     reduced_lemma,
                                                     reduced_nouns)
        older_pre_data = tools.noun_adj_matrix_gen_csv(older_pre_matrix,
                                                       reduced_lemma,
                                                       reduced_nouns)
        older_post_data = tools.noun_adj_matrix_gen_csv(
            older_post_matrix, reduced_lemma, reduced_nouns)
        younger_pre_data = tools.noun_adj_matrix_gen_csv(
            younger_pre_matrix, reduced_lemma, reduced_nouns)
        younger_post_data = tools.noun_adj_matrix_gen_csv(
            younger_post_matrix, reduced_lemma, reduced_nouns)

        # Build the header; the leading comma leaves the adjective column
        # unlabeled.
        header = "".join("," + n for n in reduced_nouns)

        # CSV-ize it!
        all_csv = tools.gen_stat_CSV(header, all_data)
        older_csv = tools.gen_stat_CSV(header, older_data)
        younger_csv = tools.gen_stat_CSV(header, younger_data)
        older_pre_csv = tools.gen_stat_CSV(header, older_pre_data)
        older_post_csv = tools.gen_stat_CSV(header, older_post_data)
        younger_pre_csv = tools.gen_stat_CSV(header, younger_pre_data)
        younger_post_csv = tools.gen_stat_CSV(header, younger_post_data)

        # Output the CSV data to files.
        tools.write_CSV(all_csv, arg.output + "/matrix-all.csv")
        tools.write_CSV(older_csv, arg.output + "/matrix-older.csv")
        tools.write_CSV(younger_csv, arg.output + "/matrix-younger.csv")
        tools.write_CSV(older_pre_csv, arg.output + "/matrix-older-pre.csv")
        tools.write_CSV(older_post_csv, arg.output + "/matrix-older-post.csv")
        tools.write_CSV(younger_pre_csv,
                        arg.output + "/matrix-younger-pre.csv")
        tools.write_CSV(younger_post_csv,
                        arg.output + "/matrix-younger-post.csv")

        return 0

    # List of known correctly tagged adjectives.
    if arg.whitelist is None:
        arg.whitelist = arg.output + '/whitelist.txt'
    adjective_whitelist = tools.read_text(arg.whitelist)
    # List of known erroneously tagged adjectives.
    if arg.blacklist is None:
        arg.blacklist = arg.output + '/blacklist.txt'
    adjective_blacklist = tools.read_text(arg.blacklist)

    # Use human-verified data to build a whitelist and blacklist for review.
    if arg.verified:
        # Lists of possible correctly and badly tagged adjectives.
        potential_whitelist = []
        potential_blacklist = []

        for st in sentences:
            temp_blacklist = st.sentence.find_bad()
            if len(temp_blacklist) > 0:
                potential_blacklist.extend(temp_blacklist)
            # find_adjectives returns (adjectives, lemmas); the whitelist
            # tracks the surface adjectives, so the lemmas are dropped.
            temp_whitelist, _ = st.sentence.find_adjectives()
            if len(temp_whitelist) > 0:
                potential_whitelist.extend(temp_whitelist)

        for word in potential_whitelist:
            if word not in potential_blacklist:
                adjective_whitelist.append(word.lower())
        for word in potential_blacklist:
            if word not in potential_whitelist:
                adjective_blacklist.append(word.lower())

        adjective_whitelist = list(set(adjective_whitelist))
        adjective_whitelist.sort()
        adjective_blacklist = list(set(adjective_blacklist))
        adjective_blacklist.sort()
        tools.save_text(adjective_whitelist, arg.whitelist)
        tools.save_text(adjective_blacklist, arg.blacklist)

        if os.path.isfile(arg.output + '/verified-groups.json'):
            tools.merge_JSON(sentences, arg.output + '/verified-groups.json')
        else:
            tools.save_JSON(sentences, arg.output + '/verified-groups.json')
    # Use generated whitelist and blacklist data to generate a new JSON file for review.
    else:
        verified = []
        to_verify = []
        for st in sentences:
            st.sentence.filter(adjective_whitelist, adjective_blacklist)
            st.sentence.sanitize_words()
            st.sentence.sanitize_sentence()
            st.sentence.find_words()
            if st.sentence.review:
                to_verify.append(st)
            else:
                verified.append(st)

        print("Sentences left to verify:    " + str(len(to_verify)) + ".")
        print("Sentences added as verified: " + str(len(verified)) + ".")

        tools.save_JSON(to_verify, arg.output + '/unverified-groups.json')

        if os.path.isfile(arg.output + '/verified-groups.json'):
            tools.merge_JSON(verified, arg.output + '/verified-groups.json')
        else:
            tools.save_JSON(verified, arg.output + '/verified-groups.json')
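tools.gen_stat_CSV and tools.write_CSV appear throughout but are not shown; a minimal sketch of the obvious semantics, assuming each row is a flat sequence of fields:

def gen_stat_CSV(header, rows):
    # Hypothetical sketch: a header line followed by one comma-joined line
    # per row.
    lines = [header]
    for row in rows:
        lines.append(",".join(str(field) for field in row))
    return "\n".join(lines)

def write_CSV(csv_data, filename):
    # Hypothetical sketch: dump the prepared CSV text to disk.
    with open(filename, "w", encoding="utf8") as f:
        f.write(csv_data)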