def process_line(l, phrase):
    """
        Parses one tagged sentence line and records the attributes of every
        word it contains.

        The last three space-separated fields of the line are dropped, then the
        first and last characters of what remains (the enclosing parentheses).
        Every remaining token is decomposed with `get_tokens` and
        `get_surface`; its attributes are cleaned (spaces removed, XML
        stripped, vertical bars escaped when `generate_text` is set) and stored
        in `phrase` under the integer token index returned by `get_tokens`.

        @param l Line read from input to be processed.

        @param phrase Container indexed by integer token index, filled in place
        with one attribute dictionary per word.
    """
    global generate_text
    sentence = " ".join(l.split(" ")[:-3])  # drop the last 3 elements
    tokens = sentence[1:-1].split(" ")  # drop the enclosing parentheses
    for position, token in enumerate(tokens, 1):  # e.g.: resume+ed:7_VVN
        try:
            (s, index, pos) = get_tokens(token)
            (surface, lemma) = get_surface(s, pos)
            raw_attributes = {
                d[1]: surface,
                d[2]: lemma,
                d[3]: pos,
                d[4]: '',
                d[0]: str(position),
            }
            attributes = {}
            for key, value in raw_attributes.items():
                value = strip_xml(value.replace(" ", ""))  # remove spaces
                if generate_text:  # escape vertical bars
                    value = value.replace("|", "%%VERTICAL_BAR%%")
                attributes[key] = value
            phrase[int(index)] = attributes
        except IndexError:
            warn("Line \"%s\" could not be processed as sentence" % l.strip())
def process_line(l, phrase):
    """
        Extracts each word of a tagged line together with its attributes and
        stores them in `phrase`.

        @param l Line read from input to be processed. Its last three
        space-separated fields and its enclosing parentheses are discarded.

        @param phrase Container indexed by integer token index; it receives one
        dictionary of cleaned attributes per word.
    """
    global generate_text

    def clean(value):
        # Remove spaces and XML markup; escape "|" only for text output.
        value = strip_xml(value.replace(" ", ""))
        if generate_text:
            value = value.replace("|", "%%VERTICAL_BAR%%")
        return value

    without_tail = " ".join(l.split(" ")[:-3])
    iword = 0
    for word in without_tail[1:-1].split(" "):  # e.g.: resume+ed:7_VVN
        iword += 1
        try:
            (s, index, pos) = get_tokens(word)
            (surface, lemma) = get_surface(s, pos)
            fields = {
                d[1]: surface,
                d[2]: lemma,
                d[3]: pos,
                d[4]: '',
                d[0]: str(iword),
            }
            phrase[int(index)] = dict(
                (key, clean(value)) for key, value in fields.items())
        except IndexError:
            warn("Line \"%s\" could not be processed as sentence" % l.strip())
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command options of this script.

        Records the location of the morphg binary (-m/--morphg) and whether
        text output was requested (-x/--moses). If the morphg path does not
        exist, a warning is issued and the morphg globals are reset to None.

        @param opts The options parsed by getopts.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.

        @param usage_string Instructions that appear if you run the program
        with the wrong parameters or options.
    """
    global morphg_folder, morphg_file, generate_text

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for flag, value in opts:
        if flag == "-m" or flag == "--morphg":
            morphg_folder, morphg_file = os.path.split(value)
        elif flag == "-x" or flag == "--moses":
            generate_text = True

    morphg_path = os.path.join(morphg_folder, morphg_file)
    if os.path.exists(morphg_path):
        return
    warn("morphg not found !!! - outputting analysed forms")
    morphg_file = None
    morphg_folder = None
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command options of this script.

        @param opts The options parsed by getopts. -m/--morphg gives the path
        of the morphg binary; -x/--moses turns on text generation.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.

        @param usage_string Instructions that appear if you run the program
        with the wrong parameters or options.
    """
    global morphg_folder
    global morphg_file
    global generate_text

    morphg_flags = ("-m", "--morphg")
    moses_flags = ("-x", "--moses")

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (name, value) in opts:
        if name in morphg_flags:
            (morphg_folder, morphg_file) = os.path.split(value)
        elif name in moses_flags:
            generate_text = True

    # Without the binary, analysed forms are output; forget the stored path.
    found = os.path.exists(os.path.join(morphg_folder, morphg_file))
    if not found:
        warn("morphg not found !!! - outputting analysed forms")
        morphg_file = None
        morphg_folder = None
def simplify_palavras(pos):
    """
        Receives a complex POS tag and returns a simplified version of it,
        using the `palavras_table` conversion table as a fallback.

        The tags "N", "V", "PCT" and "NUM" are returned unchanged. Any other
        tag containing "-" or ">" becomes "UKN". Otherwise the part of the tag
        before the first "|" is looked up in `palavras_table`; if the lookup
        fails a warning is issued and that part is returned as is.

        @param pos A string representing the POS tag

        @return A string representing the simplified POS tag
    """
    global palavras_table
    # Only the first of several "|"-separated tags is converted.
    first_tag = pos.split("|")[0]
    if pos in ("N", "V", "PCT", "NUM"):
        return pos
    if "-" in pos or ">" in pos:
        return "UKN"
    try:
        return palavras_table[first_tag]
    except Exception:
        warn("part of speech " + str(first_tag) + " not converted.")
    return first_tag
def simplify_ptb(pos):
    """
        Receives as input a complex POS tag in the Penn Treebank format (used
        by treetagger) and returns a simplified version of the same tag.

        Only the part before the first "|" is considered, so that multiple
        tags such as NNS|JJ are still converted (useful for the GENIA corpus).
        Tags not covered by the built-in rules are looked up in `ptb_table`;
        if that fails a warning is issued and the tag is returned unchanged.

        @param pos A string representing the POS tag in PTB format

        @return A string representing the simplified POS tag
    """
    global ptb_table
    tag = pos.split("|")[0]

    if tag.startswith(("N", "V")):  # NOUNS / VERBS
        return tag[0]
    if tag.startswith("J"):  # ADJECTIVES
        return "A"
    # ADVERBS, DETERMINERS, CONJUNCTIONS: matched anywhere inside the tag
    for marker, simplified in (("RB", "R"), ("DT", "DT"), ("CC", "CC")):
        if marker in tag:
            return simplified
    if tag.startswith(("PRP", "PP", "WP")):  # PRONOUNS
        return "PP"
    if tag in "\"()':?-/$.,":  # PUNCTUATION (substring test)
        return "PCT"
    try:
        return ptb_table[tag]
    except Exception:
        warn("part of speech " + str(tag) + " not converted.")
    return tag
def destroy_shelve(shlv, path):
    """
        Destroys a shelve and removes its file.

        The shelve is cleared and closed, then `path` is deleted. If that
        fails with an OSError, `path + ".db"` is tried instead. Any error that
        remains after both attempts is reported through `warn` instead of
        being raised.

        @param shlv The shelve object to clear and close.

        @param path The file name of the shelve.
    """
    shlv.clear()
    shlv.close()
    # The fallback removal must sit inside the outer `try`: an exception
    # raised within an `except` handler is not caught by sibling handlers of
    # the same `try` statement.
    try:
        try:
            os.remove(path)
        except OSError:
            os.remove(path + ".db")
    except Exception as err:
        warn("Error removing temporary file: " + str(err))
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this
        script, storing each recognised option in the matching module global.

        @param opts The options parsed by getopts.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.

        @param usage_string The usage string printed if the arguments are wrong.
    """
    global first_header, first_rater
    global calculate_pairwise, calculate_confusion
    global separator, distances_matrix, unknown

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for flag, value in opts:
        if flag in ("-r", "--raters"):
            verbose("First row in file ignored -> considered as rater labels")
            first_header = True
        elif flag in ("-i", "--items"):
            verbose(
                "First column in file ignored -> considered as item labels")
            first_rater = 1
        elif flag in ("-p", "--pairwise"):
            verbose("Computing pairwise coefficients")
            calculate_pairwise = True
        elif flag in ("-u", "--unknown"):
            verbose("Unknown value - TODO: implement: " + value)
            unknown = value
        elif flag in ("-s", "--separator"):
            verbose("Field separator: " + value)
            separator = value
            if len(separator) > 1:
                warn("Multi-char field separator!")
        elif flag in ("-d", "--distance"):
            verbose("Calculating weighted coefficients using distance file")
            distances_matrix = read_distances(value)
            if distances_matrix is None:
                warn("Error in distance matrix! Weighted coefficients will "
                     "use 1.0 as default distance")
        elif flag in ("-c", "--confusion"):
            verbose("Calculating confusion matrices")
            calculate_confusion = True
def calculate_distances(distances_map, all_categories):
    """
        Generates a distances matrix from the distances map and the
        correspondence between nominal categories and their IDs.

        @param distances_map A dictionary where the keys are strings of the
        form category1###SEPARATOR###category2 and the values are the distances
        between category1 and category2

        @param all_categories A dictionary where the keys are the string
        nominal category names and the values are the integer unique IDs of
        each category

        @return A symmetric matrix Nk x Nk (list of lists) indexed by category
        ID, with 0.0 on the main diagonal. Cells left unspecified (or given a
        negative distance) receive the maximum distance seen in the map, or 1.0
        when the map is empty. Returns None, after reporting an error, if a
        category of the map is unknown in `all_categories`.
    """
    n_categories = len(all_categories)
    # -1.0 marks "not specified yet"
    matrix = [[-1.0] * n_categories for _ in range(n_categories)]
    largest = 0.0

    for pair_key, dist in distances_map.items():
        names = pair_key.split("###SEPARATOR###")
        try:
            first, second = [all_categories[name] for name in names]
        except KeyError:
            error("Distance file incompatible with annotations\nDid not find "
                  "categories %s in the annotation data" % names)
            return None
        if first == second:
            warn(
                "defined distance for category and self for %s. Replacing by 0"
                % names[0])
        matrix[first][second] = dist
        matrix[second][first] = dist
        largest = dist if dist > largest else largest

    if not distances_map:
        largest = 1.0

    # Fill in the non-specified distances with the maximal value
    for i, row in enumerate(matrix):
        row[i] = 0.0  # Distance between categ and itself = 0
        for j, cell in enumerate(row):
            if cell < 0.0:  # Not specified or negative
                row[j] = largest
    return matrix
def calculate_distances(distances_map, all_categories):
    """
        Builds the matrix of distances between categories from the distances
        map and the mapping between category names and their IDs.

        @param distances_map A dictionary where the keys are strings of the
        form category1###SEPARATOR###category2 and the values are the distances
        between category1 and category2

        @param all_categories A dictionary where the keys are the string
        nominal category names and the values are the integer unique IDs of
        each category

        @return A symmetric Nk x Nk list of lists indexed by category ID, with
        0.0 on the main diagonal. Unspecified or negative cells are set to the
        maximum distance found in the map (1.0 if the map is empty). None is
        returned, after an error message, when the map mentions a category
        that is absent from `all_categories`.
    """
    separator_token = "###SEPARATOR###"
    size = len(all_categories.keys())
    specified = []
    for _ in range(size):
        specified.append(size * [-1.0])

    max_distance = 0.0
    for key in distances_map:
        distance = distances_map[key]
        cats = key.split(separator_token)
        try:
            k1, k2 = tuple(all_categories[cat] for cat in cats)
        except KeyError:
            error("Distance file incompatible with annotations\nDid not find "
                  "categories %s in the annotation data" % cats)
            return None
        if k1 == k2:
            warn("defined distance for category and self for %s. "
                 "Replacing by 0" % cats[0])
        specified[k1][k2] = specified[k2][k1] = distance
        max_distance = max(max_distance, distance)

    if len(distances_map) == 0:
        max_distance = 1.0

    # Diagonal is always 0; every cell still negative gets the maximal value
    result = []
    for k1, row in enumerate(specified):
        filled = []
        for k2, value in enumerate(row):
            if k1 == k2:
                filled.append(0.0)
            elif value < 0.0:
                filled.append(max_distance)
            else:
                filled.append(value)
        result.append(filled)
    return result
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this
        script. Each recognised option updates one module-level setting.

        @param opts The options parsed by getopts.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.

        @param usage_string The usage string printed if the arguments are wrong.
    """
    global first_header
    global first_rater
    global calculate_pairwise
    global calculate_confusion
    global separator
    global distances_matrix
    global unknown

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (name, value) in opts:
        if name == "-r" or name == "--raters":
            verbose("First row in file ignored -> considered as rater labels")
            first_header = True
            continue
        if name == "-i" or name == "--items":
            verbose(
                "First column in file ignored -> considered as item labels")
            first_rater = 1
            continue
        if name == "-p" or name == "--pairwise":
            verbose("Computing pairwise coefficients")
            calculate_pairwise = True
            continue
        if name == "-u" or name == "--unknown":
            verbose("Unknown value - TODO: implement: " + value)
            unknown = value
            continue
        if name == "-s" or name == "--separator":
            verbose("Field separator: " + value)
            separator = value
            if len(separator) > 1:
                warn("Multi-char field separator!")
            continue
        if name == "-d" or name == "--distance":
            verbose("Calculating weighted coefficients using distance file")
            distances_matrix = read_distances(value)
            if distances_matrix is None:
                warn("Error in distance matrix! Weighted coefficients will "
                     "use 1.0 as default distance")
            continue
        if name == "-c" or name == "--confusion":
            verbose("Calculating confusion matrices")
            calculate_confusion = True
def process_tree_branch(l, phrase):
    """
        Processes one line of the dependency tree that follows each tagged
        sentence, adding the relation it describes to the 'syn' attribute of
        the word concerned.

        Fields of the line containing ":" are taken as the words taking part
        in the relation; the remaining fields, joined with "_", form the
        relation name. A line without any word yields a warning.

        @param l Line read from input to be processed

        @param phrase Mapping from token index to the attribute dictionary of
        each word; the 'syn' entries are updated in place.
    """
    stripped = l.strip()
    fields = stripped.replace(" _", "").replace("(", "").replace(
        ")", "").split(" ")
    members = [field for field in fields if ":" in field]
    rel = "_".join(
        field.replace("|", "") for field in fields if ":" not in field)
    # ; and : have special meanings in the output format
    rel = rel.replace(";", "SEMICOLON").replace(":", "COLON")

    if not members:
        warn("Unrecogized grammatical relation \"%s\"" % stripped)
        return

    if len(members) == 1:
        # simple property: passive, have_to, etc.
        entry = phrase.get(int(get_tokens(members[0])[1]), None)
        syn = rel
    else:
        # binary (typical) dependency relation.
        # RASP's token IDs are converted into token positions in moses
        # format, because RASP sometimes skips words and assigns e.g.
        # 1 2 4 5, so dependency 2->4 should become 2->3 in 1 2 3 4.
        head = phrase[int(get_tokens(members[0])[1])]["index"]
        syn = rel + ":" + head
        if len(members) == 3:
            syn = syn + ";" + rel + ":" + get_tokens(members[1])[1]
        entry = phrase.get(int(get_tokens(members[-1])[1]), None)

    if entry and syn:
        if entry["syn"] == "":
            entry["syn"] = syn
        else:
            entry["syn"] = entry["syn"] + ";" + syn
def process_tree_branch(l, phrase):
    """
        Reads one branch of the dependency tree that follows a tagged
        sentence and appends the corresponding relation to the 'syn'
        attribute of the dependent word.

        @param l Line read from input to be processed

        @param phrase Mapping from token index to the attribute dictionary of
        each word; it is completed in place.
    """
    cleaned = l.strip()
    for unwanted in (" _", "(", ")"):
        cleaned = cleaned.replace(unwanted, "")

    name_pieces = []
    members = []
    for piece in cleaned.split(" "):
        if ":" in piece:
            members.append(piece)
        else:
            name_pieces.append(piece.replace("|", ""))
    # ; and : are removed from rel: they have special meanings in the format
    rel = "_".join(name_pieces).replace(";", "SEMICOLON").replace(
        ":", "COLON")

    n_members = len(members)
    if n_members < 1:
        warn("Unrecogized grammatical relation \"%s\"" % l.strip())
        return

    if n_members >= 2:  # binary (typical) dependency relation
        # The head ID given by RASP is replaced by the position stored in
        # the phrase, since RASP may skip words (1 2 4 5 -> 1 2 3 4).
        head_id = int(get_tokens(members[0])[1])
        syn = rel + ":" + phrase[head_id]["index"]
        if n_members == 3:
            syn = syn + ";" + rel + ":" + get_tokens(members[1])[1]
        target_id = get_tokens(members[-1])[1]
    else:  # simple property: passive, have_to, etc.
        target_id = get_tokens(members[0])[1]
        syn = rel
    entry = phrase.get(int(target_id), None)

    if not (entry and syn):
        return
    previous = entry["syn"]
    entry["syn"] = syn if previous == "" else previous + ";" + syn
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this
        script, storing them in module-level settings.

        @param opts The options parsed by getopts.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.

        @param usage_string The usage string passed on to
        `util.treat_options_simplest`.
    """
    global input_patterns, input_filetype_ext, output_filetype_ext
    global match_distance, non_overlapping, id_order
    global annotate, only_the_matching_subpart

    util.treat_options_simplest(opts, arg, n_arg, usage_string)

    for flag, value in opts:
        if flag == "--input-from":
            input_filetype_ext = value
        elif flag == "--to":
            output_filetype_ext = value
        elif flag in ("-p", "--patterns"):
            input_patterns = filetype.parse_entities([value])
        elif flag in ("-d", "--match-distance"):
            match_distance = value
        elif flag in ("-N", "--non-overlapping"):
            non_overlapping = True
        elif flag == "--id-order":
            id_order = value.split(":")
        elif flag == "--annotate":
            annotate = True
        elif flag == "--only-matching":
            only_the_matching_subpart = True
        else:
            raise Exception("Bad arg " + flag)

    if input_patterns is None:
        util.error("No patterns provided. Option --patterns is mandatory!")
    if annotate and only_the_matching_subpart:
        util.warn("Switch --only-matching disables --annotate")
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this
        script: the features to use, the sorting direction and the input and
        output file types.

        @param opts The options parsed by getopts.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.

        @param usage_string The usage string passed on to
        `treat_options_simplest`.
    """
    global feat_list, ascending
    global input_filetype_ext, output_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    direction_flags = []  # every -a / -d seen, in order
    for flag, value in opts:
        if flag in ("-f", "--feat"):
            feat_list = treat_feat_list(value)
        elif flag in ("-a", "--asc"):
            ascending = True
            direction_flags.append("a")
        elif flag in ("-d", "--desc"):
            ascending = False
            direction_flags.append("d")
        elif flag == "--from":
            input_filetype_ext = value
        elif flag == "--to":
            output_filetype_ext = value
        else:
            raise Exception("Bad arg")

    if len(direction_flags) > 1:
        warn("You must provide only one option, -a OR -d. "
             "Only the last one will be considered.")
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this
        script.

        @param opts The options parsed by getopts: -f/--feat, -a/--asc,
        -d/--desc, --from and --to. Any other option raises an Exception.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.

        @param usage_string The usage string passed on to
        `treat_options_simplest`.
    """
    global feat_list
    global ascending
    global input_filetype_ext
    global output_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    n_directions = 0
    for (name, value) in opts:
        if name == "-f" or name == "--feat":
            feat_list = treat_feat_list(value)
        elif name == "-a" or name == "--asc":
            ascending = True
            n_directions += 1
        elif name == "-d" or name == "--desc":
            ascending = False
            n_directions += 1
        elif name == "--from":
            input_filetype_ext = value
        elif name == "--to":
            output_filetype_ext = value
        else:
            raise Exception("Bad arg")

    if n_directions > 1:
        warn("You must provide only one option, -a OR -d. "
             "Only the last one will be considered.")
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this
        script.

        @param opts The options parsed by getopts: -f/--feat, -a/--asc,
        -d/--desc, -p/--precs and --from. Any other option raises an
        Exception.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.

        @param usage_string The usage string passed on to
        `treat_options_simplest`.
    """
    global feat_list
    global ascending
    global print_precs
    # Without this declaration the --from value was bound to a local
    # variable and silently discarded.
    global input_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    a_or_d = []
    for (o, a) in opts:
        if o in ("-f", "--feat"):
            feat_list = treat_feat_list(a)
        elif o in ("-a", "--asc"):
            ascending = True
            a_or_d.append("a")
        elif o in ("-d", "--desc"):
            ascending = False
            a_or_d.append("d")
        elif o in ("-p", "--precs"):
            print_precs = True
        elif o == "--from":
            input_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    if len(a_or_d) > 1:
        warn("you should provide only one option, -a OR -d. Only the last one"
             " will be considered.")
    if not feat_list:
        error("You MUST provide at least one feature with -f")
def get_percents(token_stats):
    """
        Given a vocabulary entry for a given word key, returns a dictionary
        containing the proportion of each case configuration with respect to
        all occurrences of that word.

        For each form, only occurrences that are not at the start of a
        sentence are counted (first count minus second count), since forms in
        that position might have case modifications. One is added to every
        count (add-one smoothing) so that a form occurring only in first
        position does not lead to a division by zero. The values of the result
        sum to 1.

        @param token_stats A vocabulary entry that associates each case
        configuration to an indexable pair whose item 0 is the total number of
        occurrences and item 1 the number of occurrences in first position.

        @return A dictionary that associates case configurations to a float
        equal to the smoothed count of that configuration divided by the sum
        of the smoothed counts of all configurations.
    """
    smoothed = {}
    for form, stats in token_stats.items():
        smoothed[form] = (stats[0] - stats[1]) + 1
    total = sum(smoothed.values())

    if total != 0:
        denominator = float(total)
        return dict(
            (form, count / denominator) for form, count in smoothed.items())

    for _ in smoothed:
        warn("Percents cannot be calculated for non-occurring words!")
    return smoothed
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this
        script: features, sorting direction, precision printing and input
        file type.

        @param opts The options parsed by getopts. An unknown option raises
        an Exception.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.

        @param usage_string The usage string passed on to
        `treat_options_simplest`.
    """
    global feat_list
    global ascending
    global print_precs
    # The --from branch assigns this name; it must be declared global,
    # otherwise the assignment only creates an unused local variable.
    global input_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    a_or_d = []
    for (o, a) in opts:
        if o in ("-f", "--feat"):
            feat_list = treat_feat_list(a)
        elif o in ("-a", "--asc"):
            ascending = True
            a_or_d.append("a")
        elif o in ("-d", "--desc"):
            ascending = False
            a_or_d.append("d")
        elif o in ("-p", "--precs"):
            print_precs = True
        elif o == "--from":
            input_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    if len(a_or_d) > 1:
        warn("you should provide only one option, -a OR -d. Only the last one" +
             " will be considered.")
    if not feat_list:
        error("You MUST provide at least one feature with -f")
def get_freq_web1t(surfaces, lemmas, pos):
    """
        Gets the frequency (number of occurrences) of an ngram in Google's
        Web 1T 5-gram Corpus.

        The ngram is turned into a search term with `build_entry`, the data
        file that should contain it is located (through the index file for
        n > 1), and that file is searched with the external `zgrep` command.

        @param surfaces List of surface forms of the ngram words.

        @param lemmas List of lemmas of the ngram words.

        @param pos List of parts of speech of the ngram words.

        @return The integer frequency, or 0 if the ngram has more than 5
        words, no data file can contain it, or zgrep outputs nothing.
    """
    global build_entry, web1t_data_path

    length = len(surfaces)

    if length > 5:
        warn("Cannot count the frequency of an n-gram, n>5!")
        return 0

    search_term = ' '.join(map(build_entry, surfaces, lemmas, pos))

    # Find the file in which to look for the ngram.
    if length == 1:
        filename = web1t_data_path + "/1gms/vocab.gz"
    else:
        indexfile = web1t_data_path + "/%dgms/%dgm.idx" % (length, length)
        filenames = [x.split("\t") for x in read_file(indexfile).split("\n")]
        filename = None
        for (name, first) in filenames:
            # Assumes byte-value-based ordering!
            if first > search_term:
                break
            else:
                filename = name

        if filename is None:
            return 0
        filename = "%s/%dgms/%s" % (web1t_data_path, length, filename)

    verbose("WEB1T: Opening %s, looking for %s" % (filename, search_term))

    # Reading the gzipped file from Python was far too slow, so the search is
    # delegated to zgrep.
    # NOTE(review): re.escape produces Python regex escapes, while zgrep
    # interprets the pattern with grep syntax - confirm they agree for terms
    # containing characters such as "+" or "?".
    process = subprocess.Popen(
        ["zgrep", "--", "^" + re.escape(search_term) + "\t", filename],
        stdout=subprocess.PIPE)
    # communicate() reads the whole output, closes the pipe and waits for the
    # child, so that no unreaped process is left behind.
    line = process.communicate()[0]
    if not isinstance(line, str):
        # The pipe yields bytes where str is a text type.
        line = line.decode("utf-8")

    if line:
        freq = int(line.split("\t")[1])
    else:
        freq = 0

    verbose("freq =" + str(freq))
    return freq
def calculate_ams(o, m_list, N, corpus_name):
    """
        Given a joint frequency of the ngram, a list of individual
        frequencies, a corpus size and a corpus name, generates a list of
        `Features`, each containing the value of an Association Measure.
        Only the measures named in the global `measures` are calculated.

        @param o The float value corresponding to the number of occurrences of
        the ngram.

        @param m_list A list of float values corresponding to the number of
        occurrences of each of the words composing the ngram. The list should
        NEVER be empty, otherwise the result is undefined.

        @param N The float value corresponding to the number of tokens in the
        corpus, i.e. its total size. The size of the corpus should NEVER be
        zero, otherwise the result is undefined.

        @param corpus_name A string that uniquely identifies the corpus from
        which the counts were drawn.

        @return A list of `Feature` objects named <measure>_<corpus_name>.
    """
    # N is never null!!!
    # m_list is never empty!!!
    global measures, heuristic_combine, not_normalize_mle, warn_ll_bigram_only
    feats = []
    n = len(m_list)
    e = expect(m_list, N)
    if "mle" in measures:
        if not_normalize_mle:
            mle = int(o)
        else:
            mle = o / N
        feats.append(Feature("mle_" + corpus_name, mle))
    if "pmi" in measures:
        if e != 0 and o != 0:
            pmi = math.log(o / e, 2)
        else:
            pmi = 0.0
        feats.append(Feature("pmi_" + corpus_name, pmi))
    if "t" in measures:
        if o != 0.0:
            t = (o - e) / math.sqrt(o)
        else:
            t = 0.0
        feats.append(Feature("t_" + corpus_name, t))
    if "dice" in measures:
        if sum(m_list) != 0.0:
            dice = (n * o) / sum(m_list)
        else:
            dice = 0.0
        feats.append(Feature("dice_" + corpus_name, dice))
    if "ll" in measures:
        if len(m_list) == 2:
            # Contingency tables observed, expected
            (ct_os, ct_es) = contingency_tables([o], m_list, N, corpus_name)
            ll_list = []
            # Calculation is suitable for generic ngrams.
            # zip replaces the former map(None, ...), which only exists in
            # Python 2; both lists have one table per bigram, so the pairing
            # is the same.
            for (ct_o, ct_e) in zip(ct_os, ct_es):
                ll = 0.0
                for i in range(2):
                    for j in range(2):
                        # Empty observed cells contribute nothing
                        if ct_o[i][j] != 0.0:
                            ll += ct_o[i][j] * (math.log(ct_o[i][j], 10) -
                                                math.log(ct_e[i][j], 10))
                ll *= 2
                ll_list.append(ll)
            ll_final = heuristic_combine(ll_list)
        else:
            # Warn only once per run
            if warn_ll_bigram_only:
                warn_ll_bigram_only = False
                warn("log-likelihood is only implemented for 2grams. "
                     "Defaults to 0.0 for n>2")
            ll_final = 0.0
        feats.append(Feature("ll_" + corpus_name, ll_final))
    return feats
def contingency_tables(bigram_freqs, unigram_freqs, N, corpus_name):
    """
        Given an ngram (generic n) w_1 ... w_n, builds the observed and
        expected contingency tables of each of its n-1 bigrams.

        Each table is a 2x2 list holding the 4 possible outcomes for the
        occurrence of a bigram, i.e. c(w1 w2), c(w1 ~w2), c(~w1 w2) and
        c(~w1 ~w2), where "~w" means "any word but w". Observed values are
        derived from the given frequencies; expected values are obtained with
        `expect`, i.e. assuming that the occurrences of the two words are
        independent events.

        @param bigram_freqs List of integers representing bigram frequencies
        [ f(w_1 w_2), ..., f(w_n-1 w_n) ]. This list should contain n-1
        elements.

        @param unigram_freqs List of integers representing unigram (word)
        frequencies [ f(w_1), ..., f(w_n) ]. No bigram can occur more than
        the words it contains: any inconsistency is reported as a warning and
        corrected IN PLACE in this list.

        @param N The size of the corpus.

        @param corpus_name The name of the corpus from which frequencies were
        drawn. Only used in warning messages.

        @return a couple (observed, expected) of lists of size n-1, each cell
        containing a 2x2 table; None if the two input lists have inconsistent
        lengths.
    """
    n = len(unigram_freqs)
    if len(bigram_freqs) != n - 1:
        warn("Invalid unigram/bigram frequencies passed to "
             "calculate_negations function")
        return None

    # 1) Verify that all the frequencies are valid
    for i, joint in enumerate(bigram_freqs):
        exceeds_left = joint > unigram_freqs[i]
        exceeds_right = joint > unigram_freqs[i + 1]
        if exceeds_left or exceeds_right:
            warn(corpus_name +
                 " unigrams must occur at least as much as bigram.")
        if exceeds_left:
            warn("Automatic correction: " + str(unigram_freqs[i]) + " -> " +
                 str(joint))
            unigram_freqs[i] = joint
        if exceeds_right:
            warn("Automatic correction: " + str(unigram_freqs[i + 1]) +
                 " -> " + str(joint))
            unigram_freqs[i + 1] = joint

    # 2) Calculate negative freqs
    observed = []
    expected = []
    n_positions = N - n + 1
    for i, cw1w2 in enumerate(bigram_freqs):
        cw1 = unigram_freqs[i]
        cw2 = unigram_freqs[i + 1]
        e_both = expect([cw1, cw2], N)
        e_first_only = expect([cw1, n_positions - cw2], N)
        e_second_only = expect([n_positions - cw1, cw2], N)
        e_neither = expect([n_positions - cw1, n_positions - cw2], N)
        observed.append([
            [cw1w2, cw1 - cw1w2],
            [cw2 - cw1w2, n_positions - cw1 - cw2 + cw1w2],
        ])
        expected.append([
            [e_both, e_first_only],
            [e_second_only, e_neither],
        ])
    return (observed, expected)
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this
        script: thresholds, feature equality, patterns, length and occurrence
        limits, and input/output file types.

        @param opts The options parsed by getopts. An unknown option raises
        an Exception.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.

        @param usage_string The usage string passed on to
        `treat_options_simplest`.
    """
    global thresh_source, thresh_value
    global equals_name, equals_value
    global reverse
    global minlength, maxlength
    global min_mweoccurs, max_mweoccurs
    global input_filetype_ext, output_filetype_ext
    global patterns

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for flag, value in opts:
        if flag in ("-t", "--threshold"):
            parsed_threshold = interpret_threshold(value)
            if not parsed_threshold:
                error("The format of the -t argument must be <source>:"
                      "<value>\n<source> must be a valid corpus name and "
                      "<value> must be a non-negative integer")
            else:
                (thresh_source, thresh_value) = parsed_threshold
        elif flag in ("-e", "--equals"):
            parsed_equals = interpret_equals(value)
            if not parsed_equals:
                error("The format of the -e argument must be <name>:"
                      "<value>\n<name> must be a valid feat name and "
                      "<value> must be a non-empty string")
            else:
                (equals_name, equals_value) = parsed_equals
        elif flag in ("-p", "--patterns"):
            verbose("Reading patterns file")
            patterns = filetype.parse_entities([value])
        elif flag in ("-r", "--reverse"):
            reverse = True
            verbose("Option REVERSE active")
        elif flag in ("-i", "--minlength"):
            minlength = interpret_length(value, "minimum")
        elif flag in ("-a", "--maxlength"):
            maxlength = interpret_length(value, "maximum")
        elif flag == "--min-mweoccurs":
            min_mweoccurs = interpret_length(value, "minimum")
        elif flag == "--max-mweoccurs":
            max_mweoccurs = interpret_length(value, "maximum")
        elif flag == "--from":
            input_filetype_ext = value
        elif flag == "--to":
            output_filetype_ext = value
        else:
            raise Exception("Bad arg: " + flag)

    if minlength > maxlength:
        warn("minlength should be <= maxlength")
    if min_mweoccurs > max_mweoccurs:
        warn("min-mweoccurs should be <= max-mweoccurs")
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this
        script. Exactly one of -p (patterns file) and -n (ngram) must be
        given; with -p the patterns file is parsed once all options are read.

        @param opts The options parsed by getopts. An unknown option raises
        an Exception.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.

        @param usage_string The usage string passed on to
        `treat_options_simplest`.
    """
    global patterns, ignore_pos, surface_instead_lemmas
    global print_cand_freq, print_source
    global match_distance, non_overlapping
    global input_filetype_ext, output_filetype_ext
    global id_order

    treat_options_simplest(opts, arg, n_arg, usage_string)

    mode = []
    patterns_file = None
    for flag, value in opts:
        if flag in ("-p", "--patterns"):
            mode.append("patterns")
            patterns_file = value
        elif flag in ("-n", "--ngram"):
            create_patterns_file(value)
            mode.append("ngram")
        elif flag in ("-g", "--ignore-pos"):
            ignore_pos = True
        elif flag in ("-d", "--match-distance"):
            match_distance = value
        elif flag in ("-N", "--non-overlapping"):
            non_overlapping = True
        elif flag in ("-s", "--surface"):
            surface_instead_lemmas = True
        elif flag in ("-S", "--source"):
            print_source = True
        elif flag in ("-f", "--freq"):
            print_cand_freq = True
        elif flag in ("-i", "--index"):
            input_filetype_ext = "BinaryIndex"
            warn("Option -i is deprecated; use --from=BinaryIndex")
        elif flag == "--id-order":
            id_order = value.split(":")
        elif flag == "--from":
            input_filetype_ext = value
        elif flag == "--to":
            output_filetype_ext = value
        else:
            raise Exception("Bad flag")

    # If we are taking all matches, we need to be able to overlap...
    if non_overlapping and match_distance == "All":
        error(
            "Conflicting options: --match-distance=All and --non-overlapping")

    if len(mode) != 1:
        error("Exactly one option, -p or -n, must be provided")

    if "patterns" in mode:
        patterns = filetype.parse_entities([patterns_file])
def treat_options(opts, arg, n_arg, usage_string):
    """
    Callback function that applies the command line options of this
    script to the module-level configuration variables.

    @param opts The (option, value) pairs parsed by getopts. Each pair is
    examined in order.

    @param arg The argument list parsed by getopts.

    @param n_arg The number of arguments expected for this script.

    @param usage_string The usage message, forwarded unchanged to
    `treat_options_simplest`.
    """
    global patterns
    global ignore_pos, surface_instead_lemmas
    global print_cand_freq, print_source
    global match_distance, non_overlapping
    global input_filetype_ext, output_filetype_ext
    global id_order

    treat_options_simplest(opts, arg, n_arg, usage_string)

    # One entry is recorded per -p / -n occurrence; exactly one is required.
    mode = []
    patterns_file = None

    for flag, value in opts:
        if flag in ("-p", "--patterns"):
            mode.append("patterns")
            patterns_file = value
        elif flag in ("-n", "--ngram"):
            create_patterns_file(value)
            mode.append("ngram")
        elif flag in ("-g", "--ignore-pos"):
            ignore_pos = True
        elif flag in ("-d", "--match-distance"):
            match_distance = value
        elif flag in ("-N", "--non-overlapping"):
            non_overlapping = True
        elif flag in ("-s", "--surface"):
            surface_instead_lemmas = True
        elif flag in ("-S", "--source"):
            print_source = True
        elif flag in ("-f", "--freq"):
            print_cand_freq = True
        elif flag in ("-i", "--index"):
            input_filetype_ext = "BinaryIndex"
            warn("Option -i is deprecated; use --from=BinaryIndex")
        elif flag == "--id-order":
            id_order = value.split(":")
        elif flag == "--from":
            input_filetype_ext = value
        elif flag == "--to":
            output_filetype_ext = value
        else:
            raise Exception("Bad flag")

    if non_overlapping and match_distance == "All":
        # If we are taking all matches, we need to be able to overlap...
        error("Conflicting options: --match-distance=All and --non-overlapping")

    if len(mode) != 1:
        error("Exactly one option, -p or -n, must be provided")

    # The patterns file is parsed only after all options were examined.
    if "patterns" in mode:
        patterns = filetype.parse_entities([patterns_file])
def calculate_ams(o, m_list, N, corpus_name):
    """
    Given a joint frequency of the ngram, a list of individual
    frequencies, a corpus size and a corpus name, generates a list of
    `Features`, each containing the value of an Association Measure.
    Only the measures listed in the global `measures` are computed, and
    each feature is named "<measure>_<corpus_name>".

    @param o The float value corresponding to the number of occurrences
    of the ngram.

    @param m_list A list of float values corresponding to the number of
    occurrences of each of the words composing the ngram. The list should
    NEVER be empty, otherwise the result is undefined. When "ll" is
    requested for a bigram, this list is handed to `contingency_tables`,
    which may correct inconsistent counts in place.

    @param N The float value corresponding to the number of tokens in the
    corpus, i.e. its total size. The size of the corpus should NEVER be
    zero, otherwise the result is undefined.

    @param corpus_name A string that uniquely identifies the corpus from
    which the counts were drawn.

    @return The list of `Feature` objects, in the fixed order mle, pmi, t,
    dice, ll (restricted to the requested measures).
    """
    # N is never null!!!
    # m_list is never empty!!!
    global measures, heuristic_combine, not_normalize_mle, warn_ll_bigram_only
    feats = []
    n = len(m_list)
    e = expect(m_list, N)

    if "mle" in measures:
        if not_normalize_mle:
            mle = int(o)
        else:
            mle = o / N
        feats.append(Feature("mle_" + corpus_name, mle))

    if "pmi" in measures:
        # log2(observed / expected); defined as 0.0 when either is zero
        if e != 0 and o != 0:
            pmi = math.log(o / e, 2)
        else:
            pmi = 0.0
        feats.append(Feature("pmi_" + corpus_name, pmi))

    if "t" in measures:
        if o != 0.0:
            t = (o - e) / math.sqrt(o)
        else:
            t = 0.0
        feats.append(Feature("t_" + corpus_name, t))

    if "dice" in measures:
        total = sum(m_list)
        if total != 0.0:
            dice = (n * o) / total
        else:
            dice = 0.0
        feats.append(Feature("dice_" + corpus_name, dice))

    if "ll" in measures:
        if len(m_list) == 2:
            # Contingency tables observed, expected
            (ct_os, ct_es) = contingency_tables([o], m_list, N, corpus_name)
            ll_list = []
            # Calculation is suitable for generic ngrams.
            # zip() replaces the Python-2-only map(None, ...) idiom; both
            # lists always hold one table per bigram, so no padding is lost.
            for (ct_o, ct_e) in zip(ct_os, ct_es):
                ll = 0.0
                for i in range(2):
                    for j in range(2):
                        # Empty observed cells contribute nothing
                        if ct_o[i][j] != 0.0:
                            ll += ct_o[i][j] * (math.log(ct_o[i][j], 10) -
                                                math.log(ct_e[i][j], 10))
                ll *= 2
                ll_list.append(ll)
            ll_final = heuristic_combine(ll_list)
        else:
            # Emit the warning only once per run
            if warn_ll_bigram_only:
                warn_ll_bigram_only = False
                warn("log-likelihood is only implemented for 2grams. "
                     "Defaults to 0.0 for n>2")
            ll_final = 0.0
        feats.append(Feature("ll_" + corpus_name, ll_final))

    return feats
def treat_options(opts, arg, n_arg, usage_string):
    """
    Callback function that applies the command line options of this
    script to the module-level configuration variables.

    @param opts The (option, value) pairs parsed by getopts. Each pair is
    examined in order.

    @param arg The argument list parsed by getopts.

    @param n_arg The number of arguments expected for this script.

    @param usage_string The usage message, forwarded unchanged to
    `treat_options_simplest`.
    """
    global thresh_source, thresh_value
    global equals_name, equals_value
    global reverse
    global minlength, maxlength
    global min_mweoccurs, max_mweoccurs
    global input_filetype_ext, output_filetype_ext
    global patterns

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for flag, value in opts:
        if flag in ("-t", "--threshold"):
            parsed_threshold = interpret_threshold(value)
            if not parsed_threshold:
                error("The format of the -t argument must be <source>:"
                      "<value>\n<source> must be a valid corpus name and "
                      "<value> must be a non-negative integer")
            else:
                thresh_source, thresh_value = parsed_threshold
        elif flag in ("-e", "--equals"):
            parsed_equals = interpret_equals(value)
            if not parsed_equals:
                error("The format of the -e argument must be <name>:"
                      "<value>\n<name> must be a valid feat name and "
                      "<value> must be a non-empty string")
            else:
                equals_name, equals_value = parsed_equals
        elif flag in ("-p", "--patterns"):
            verbose("Reading patterns file")
            patterns = filetype.parse_entities([value])
        elif flag in ("-r", "--reverse"):
            reverse = True
            verbose("Option REVERSE active")
        elif flag in ("-i", "--minlength"):
            minlength = interpret_length(value, "minimum")
        elif flag in ("-a", "--maxlength"):
            maxlength = interpret_length(value, "maximum")
        elif flag == "--min-mweoccurs":
            min_mweoccurs = interpret_length(value, "minimum")
        elif flag == "--max-mweoccurs":
            max_mweoccurs = interpret_length(value, "maximum")
        elif flag == "--from":
            input_filetype_ext = value
        elif flag == "--to":
            output_filetype_ext = value
        else:
            raise Exception("Bad arg: " + flag)

    # Inconsistent bounds are only reported, never corrected.
    if minlength > maxlength:
        warn("minlength should be <= maxlength")
    if min_mweoccurs > max_mweoccurs:
        warn("min-mweoccurs should be <= max-mweoccurs")
def contingency_tables(bigram_freqs, unigram_freqs, N, corpus_name):
    """
    Given an ngram (generic n) w_1 ... w_n, the input is a couple of lists
    containing integer frequencies, the output is a couple of lists with
    contingency tables. The first list contains bigram frequencies
    [ f(w_1 w_2), f(w_2 w_3), ..., f(w_n-1 w_n) ]. The second list
    contains unigram frequencies [ f(w_1), f(w_2), ..., f(w_n) ]. While
    the first list contains n-1 elements, the second list contains n
    elements. The result is a couple of lists with contingency tables, the
    first corresponds to the observed frequencies, the second to expected
    frequencies. The contingency tables are 2D lists that contain the 4
    possible outcomes for the occurrence of a bigram, i.e. c(w1 w2),
    c(w1 ~w2), c(~w1 w2) and c(~w1 ~w2), where "~w" means "any word but w".
    Observed contingency tables are exact calculations based on simple set
    operations (intersection, difference). The expected frequencies are
    calculated using maximum likelihood for independent events (e.g. the
    occurrence of w1 does not change the probability of the occurrence of
    w2 or of ~w2 immediately after w1, also noted P(w2|w1)=P(w2)).

    @param bigram_freqs List of integers representing bigram frequencies.
    Notice that no bigram can occur more than the words that it contains.
    Any inconsistency will be automatically corrected and output as a
    warning. This list should contain n-1 elements.

    @param unigram_freqs List of integers representing unigram (word)
    frequencies. This list should have n elements. NOTE: the automatic
    correction is applied IN PLACE, so the caller's list is modified when
    an inconsistency is found.

    @param N The number of tokens in the corpus. It is passed to `expect`
    and used to derive the "any word but w" counts.

    @param corpus_name The name of the corpus from which frequencies were
    drawn. This is only used to provide friendly warning messages.

    @return a couple (observed, expected), where observed and expected are
    lists, both of size n-1, and each cell of each list contains a 2x2
    table with observed and expected contingency tables for the bigrams
    given as input. Returns None (after a warning) if `bigram_freqs` does
    not have exactly n-1 elements.
    """
    observed = []
    expected = []
    n = len(unigram_freqs)
    if len(bigram_freqs) != n - 1:
        warn("Invalid unigram/bigram frequencies passed to "
             "contingency_tables function")
        return None

    # 1) Verify that all the frequencies are valid
    for i, bigram_freq in enumerate(bigram_freqs):
        if bigram_freq > unigram_freqs[i] or \
           bigram_freq > unigram_freqs[i + 1]:
            warn(corpus_name +
                 " unigrams must occur at least as much as bigram.")
            # Raise each offending unigram count up to the bigram count
            if bigram_freq > unigram_freqs[i]:
                warn("Automatic correction: " +
                     str(unigram_freqs[i]) + " -> " +
                     str(bigram_freq))
                unigram_freqs[i] = bigram_freq
            if bigram_freq > unigram_freqs[i + 1]:
                warn("Automatic correction: " +
                     str(unigram_freqs[i + 1]) + " -> " +
                     str(bigram_freq))
                unigram_freqs[i + 1] = bigram_freq

    # 2) Calculate negative freqs
    # Total from which the "~w" (any word but w) counts are derived
    total = N - n + 1
    for i, cw1w2 in enumerate(bigram_freqs):
        o = [2 * [-1], 2 * [-1]]
        e = [2 * [-1], 2 * [-1]]
        cw1 = unigram_freqs[i]
        cw2 = unigram_freqs[i + 1]
        # c(w1 w2)
        o[0][0] = cw1w2
        e[0][0] = expect([cw1, cw2], N)
        # c(w1 ~w2)
        o[0][1] = cw1 - cw1w2
        e[0][1] = expect([cw1, total - cw2], N)
        # c(~w1 w2)
        o[1][0] = cw2 - cw1w2
        e[1][0] = expect([total - cw1, cw2], N)
        # c(~w1 ~w2)
        # BEWARE! THERE WAS A HUGE ERROR HERE, CORRECTED ON APRIL 18, 2012
        # ALL LOG-LIKELIHOOD VALUES CALCULATED BY THE TOOLKIT WERE WRONG!
        # PLEASE RE-RUN IF YOU USED THE OLD VERSION!
        o[1][1] = total - cw1 - cw2 + cw1w2
        e[1][1] = expect([total - cw1, total - cw2], N)
        observed.append(o)
        expected.append(e)
    return (observed, expected)