Example #1
0
def process_line(l, phrase):
    """ 
        This function processes the tagged line, extracting each word and its 
        attributes. Information is stored in a list of dictionaries (one dict 
        per word) where will be filled all the words' attributes.

        @param l Line read from input to be processed.
        
        @param phrase List of dictionaries to be completed.
    """
    global generate_text
    words = l.split(" ")[:-3]  #remove last 3 elements
    words = " ".join(words)[1:-1].split(" ")  # remove parentheses
    for (iword, word) in enumerate(words):  #e.g.: resume+ed:7_VVN
        try:
            (s, index, pos) = get_tokens(word)
            (surface, lemma) = get_surface(s, pos)
            dic = {
                d[1]: surface,
                d[2]: lemma,
                d[3]: pos,
                d[4]: '',
                d[0]: str(iword + 1)
            }
            for key in dic.keys():
                dic[key] = dic[key].replace(" ", "")  # remove spaces
                dic[key] = strip_xml(dic[key])
                if generate_text:  # escape vertical bars
                    dic[key] = dic[key].replace("|", "%%VERTICAL_BAR%%")
            phrase[int(index)] = dic
        except IndexError:
            warn("Line \"%s\" could not be processed as sentence" % l.strip())
Example #2
0
def process_line( l, phrase ):
    """ 
        This function processes the tagged line, extracting each word and its 
        attributes. Information is stored in a list of dictionaries (one dict 
        per word) where will be filled all the words' attributes.

        @param l Line read from input to be processed.
        
        @param phrase List of dictionaries to be completed.
    """
    global generate_text
    words = l.split(" ")[:-3]                            #remove last 3 elements
    words = " ".join( words )[1:-1].split( " " )         # remove parentheses
    for (iword, word) in enumerate( words ) : #e.g.: resume+ed:7_VVN
        try :
            (s, index, pos) = get_tokens(word)
            (surface, lemma) = get_surface(s, pos)
            dic={d[1]:surface,d[2]:lemma,d[3]:pos,d[4]:'',d[0]:str(iword+1)}
            for key in dic.keys() :
                dic[key] = dic[key].replace(" ","") # remove spaces
                dic[key] = strip_xml(dic[key])
                if generate_text : # escape vertical bars
                    dic[key] = dic[key].replace( "|","%%VERTICAL_BAR%%" )
            phrase[ int(index) ] = dic
        except IndexError:
            warn( "Line \"%s\" could not be processed as sentence" % l.strip() )
Example #3
0
def treat_options(opts, arg, n_arg, usage_string):
    """  
    Callback function that handles the command options of this script.

    @param opts The options parsed by getopts. Ignored.

    @param arg The argument list parsed by getopts.

    @param n_arg The number of arguments expected for this script.

    @param usage_string Instructions that appear if you run the program with
    the wrong parameters or options.
    """
    global morphg_folder
    global morphg_file
    global generate_text
    treat_options_simplest(opts, arg, n_arg, usage_string)
    for (o, a) in opts:
        if o in ("-m", "--morphg"):
            morphg_folder, morphg_file = os.path.split(a)
        elif o in ("-x", "--moses"):
            generate_text = True
    if not os.path.exists(os.path.join(morphg_folder, morphg_file)):
        warn("morphg not found !!! - outputting analysed forms")
        morphg_file = None
        morphg_folder = None
Example #4
0
def treat_options( opts, arg, n_arg, usage_string):
    """  
    Callback function that handles the command options of this script.

    @param opts The options parsed by getopts. Ignored.

    @param arg The argument list parsed by getopts.

    @param n_arg The number of arguments expected for this script.

    @param usage_string Instructions that appear if you run the program with
    the wrong parameters or options.
    """
    global morphg_folder
    global morphg_file
    global generate_text
    treat_options_simplest( opts, arg, n_arg, usage_string )
    for (o, a) in opts:
        if o in ("-m","--morphg"):
            morphg_folder, morphg_file = os.path.split( a )
        elif o in ("-x","--moses"):
            generate_text = True
    if not os.path.exists( os.path.join( morphg_folder, morphg_file ) ) :
        warn( "morphg not found !!! - outputting analysed forms" )
        morphg_file = None
        morphg_folder = None
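
The treat_options callbacks in this family of scripts all receive already-parsed option pairs. A minimal sketch of how such a callback is typically wired up with getopt (the usage string and entry point are illustrative, not from the original scripts):

import getopt
import sys

def main():
    usage_string = "Usage: %s [-m MORPHG_PATH] [-x] <input>" % sys.argv[0]
    try:
        # Short and long option names mirror those tested in treat_options
        opts, args = getopt.getopt(sys.argv[1:], "m:x", ["morphg=", "moses"])
    except getopt.GetoptError as err:
        warn(str(err))
        sys.exit(1)
    treat_options(opts, args, len(args), usage_string)

if __name__ == "__main__":
    main()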
Example #5
0
def simplify_palavras(pos):
    """
        Receives as input a complex POS tag in the Penn Treebank format (used by 
        treetagger) and return a simplified version of the same tag.
        
        @param pos A string representing the POS tag in PTB format
        
        @return A string representing the simplified POS tag
    """
    # The "split" part is to avoid that multiple POS like NNS|JJ are not
    # converted. We simply take the first POS, ignoring the second one.
    # This is useful when processing the GENIA corpus

    global palavras_table

    newpos = pos.split("|")[0]
    if newpos in ("N", "V", "PCT", "NUM"):
        pass  # already simple, keep as-is
    elif "-" in newpos or ">" in newpos:
        newpos = "UKN"
    else:
        try:
            newpos = palavras_table[newpos]
        except KeyError:
            warn("part of speech " + str(newpos) + " not converted.")
    return newpos
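
For reference, a few direct calls exercising the hard-coded branches above (the table-driven branch depends on the contents of palavras_table, defined elsewhere, so it is not shown):

assert simplify_palavras("N") == "N"        # simple tags kept unchanged
assert simplify_palavras("NUM") == "NUM"
assert simplify_palavras("ADV>A") == "UKN"  # tags with "-" or ">" are unknown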
Example #6
0
def simplify_ptb(pos):
    """
        Receives as input a complex POS tag in the Penn Treebank format (used by 
        treetagger) and returns a simplified version of the same tag.
        
        @param pos A string representing the POS tag in PTB format
        
        @return A string representing the simplified POS tag
    """
    global ptb_table
    # The "split" part is to avoid that multiple POS like NNS|JJ are not
    # converted. We simply take the first POS, ignoring the second one.
    # This is useful when processing the GENIA corpus
    newpos = pos.split("|")[0]
    if newpos.startswith("N") or newpos.startswith("V"):  # NOUNS / VERBS
        newpos = newpos[0]
    elif newpos.startswith("J"):  # ADJECTIVES
        newpos = "A"
    elif "RB" in newpos:  # ADVERBS
        newpos = "R"
    elif "DT" in newpos:  # DETERMINERS
        newpos = "DT"
    elif "CC" in newpos:  # CONJUNCTIONS
        newpos = "CC"
    elif newpos.startswith("PRP") or newpos.startswith(
            "PP") or newpos.startswith("WP"):  # PRONOUNS
        newpos = "PP"
    elif newpos in "\"()':?-/$.,":  # PUNCTUATION
        newpos = "PCT"
    else:
        try:
            newpos = ptb_table[newpos]
        except KeyError:
            warn("part of speech " + str(newpos) + " not converted.")
    return newpos
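
For reference, a few direct calls showing how the branches above collapse PTB tags:

assert simplify_ptb("NNS|JJ") == "N"  # multiple tags: the first one wins
assert simplify_ptb("VBZ") == "V"     # verbs collapse to "V"
assert simplify_ptb("JJR") == "A"     # adjectives collapse to "A"
assert simplify_ptb(",") == "PCT"     # punctuation collapses to "PCT"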
Example #7
0
def destroy_shelve(shlv, path):
    """
        Destroys a shelve and removes its file.
    """
    shlv.clear()
    shlv.close()
    try:
        os.remove(path)
    except OSError:
        # Some dbm backends store the data under an extra ".db" suffix
        try:
            os.remove(path + ".db")
        except OSError as err:
            warn("Error removing temporary file: " + str(err))
Example #8
0
def destroy_shelve(shlv, path):
    """
        Destroys a shelve and removes its file.
    """
    shlv.clear()
    shlv.close()
    try:
        os.remove(path)
    except OSError:
        # Some dbm backends store the data under an extra ".db" suffix
        try:
            os.remove(path + ".db")
        except OSError as err:
            warn("Error removing temporary file: " + str(err))
Example #9
0
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.
        
        @param opts The options parsed by getopts.        
        @param arg The argument list parsed by getopts.        
        @param n_arg The number of arguments expected for this script.        
        @param usage_string The usage string printed if the arguments are wrong.        
    """

    global first_header
    global first_rater
    global calculate_pairwise
    global calculate_confusion
    global separator
    global distances_matrix
    global unknown

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-r", "--raters"):
            verbose("First row in file ignored -> considered as rater labels")
            first_header = True
        if o in ("-i", "--items"):
            verbose(
                "First column in file ignored -> considered as item labels")
            first_rater = 1
        if o in ("-p", "--pairwise"):
            verbose("Computing pairwise coefficients")
            calculate_pairwise = True
        if o in ("-u", "--unknown"):
            verbose("Unknown value - TODO: implement: " + a)
            unknown = a
        if o in ("-s", "--separator"):
            verbose("Field separator: " + a)
            separator = a
            if len(separator) > 1:
                warn("Multi-char field separator!")
        if o in ("-d", "--distance"):
            verbose("Calculating weighted coefficients using distance file")
            distances_matrix = read_distances(a)
            if distances_matrix is None:
                warn(
                    "Error in distance matrix! Weighted coefficients will use 1.0 as default distance"
                )
        if o in ("-c", "--confusion"):
            verbose("Calculating confusion matrices")
            calculate_confusion = True
Example #10
0
def calculate_distances(distances_map, all_categories):
    """
        Generates a distances matrix from the distances map and the 
        correspondence between nominal categories and their IDs. This function
        is called just after reading the data when a distances file is provided.
        
        @param distances_map A dictionary where the keys are strings of the form
        category1###SEPARATOR###category2 and the values are the distances
        between category1 and category2
        @param all_categories A dictionary where the keys are the string nominal
        category names and the values are the integer unique IDs of each 
        category
        @return A symmetric matrix Nk x Nk, with the distance between categories 
        represented in the cells. The rows and columns are indexed with the IDs
        from 0 to Nk-1, and the matrix contains 0.0 on the main diagonal. Values
        not specified in the distances_map are set to the maximum distance seen
        in the map by default. If no distance file is provided (distances_map is
        empty), the distance between any two different categories is 1.0.
    """
    Nk = len(all_categories.keys())
    distances_matrix = []
    max_distance = 0.0
    for k in range(Nk):
        distances_matrix.append(Nk * [-1.0])
    for key, distance in distances_map.items():
        cats = key.split("###SEPARATOR###")
        try:
            k1, k2 = map(lambda x: all_categories[x], cats)
        except KeyError:
            error("Distance file incompatible with annotations\nDid not find "
                  "categories %s in the annotation data" % cats)
            return None
        if k1 == k2:
            warn(
                "Distance defined between category %s and itself; replacing by 0"
                % cats[0])
        distances_matrix[k1][k2] = distance
        distances_matrix[k2][k1] = distance
        if distance > max_distance:
            max_distance = distance
    if len(distances_map.keys()) == 0:
        max_distance = 1.0
    # Fill in the non-specified distances with the maximal value
    for k1 in range(Nk):
        distances_matrix[k1][k1] = 0.0  # Distance between categ and itself = 0
        for k2 in range(Nk):
            if distances_matrix[k1][k2] < 0.0:  # Not specified or negative
                distances_matrix[k1][k2] = max_distance
    return distances_matrix
Example #11
0
def calculate_distances( distances_map, all_categories ) :
    """
        Generates a distances matrix from the distances map and the 
        correspondence between nominal categories and their IDs. This function
        is called just after reading the data when a distances file is provided.
        
        @param distances_map A dictionary where the keys are strings of the form
        category1###SEPARATOR###category2 and the values are the distances
        between category1 and category2
        @param all_categories A dictionary where the keys are the string nominal
        category names and the values are the integer unique IDs of each 
        category
        @return A symmetric matrix Nk x Nk, with the distance between categories 
        represented in the cells. The rows and columns are indexed with the IDs
        from 0 to Nk-1, and the matrix contains 0.0 on the main diagonal. Values
        not specified in the distances_map are set to the maximum distance seen
        in the map by default. If no distance file is provided (distances_map is
        empty), the distance between any two different categories is 1.0.
    """
    Nk = len( all_categories.keys() )
    distances_matrix = []
    max_distance = 0.0
    for k in range(Nk) :
        distances_matrix.append( Nk * [-1.0] )
    for key,distance in distances_map.items() :
        cats = key.split("###SEPARATOR###")
        try :
            k1,k2 = map(lambda x : all_categories[x], cats )
        except KeyError :
            error("Distance file incompatible with annotations\nDid not find "
                  "categories %s in the annotation data" % cats)
            return None
        if k1 == k2 :
            warn("defined distance for category and self for %s. Replacing by 0"
                 % cats[ 0 ])
        distances_matrix[ k1 ][ k2 ] = distance
        distances_matrix[ k2 ][ k1 ] = distance
        if distance > max_distance :
            max_distance = distance
    if len( distances_map.keys() ) == 0 :        
        max_distance = 1.0
    # Fill in the non-specified distances with the maximal value
    for k1 in range(Nk) :
        distances_matrix[k1][k1] = 0.0 # Distance between categ and itself = 0
        for k2 in range(Nk) :
            if distances_matrix[ k1 ][ k2 ] < 0.0 : # Not specified or negative
                distances_matrix[ k1 ][ k2 ] = max_distance     
    return distances_matrix
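
A small worked example of calculate_distances (category names and distances invented for illustration):

all_categories = {"low": 0, "mid": 1, "high": 2}
distances_map = {"low###SEPARATOR###high": 2.0}  # only one pair specified

m = calculate_distances(distances_map, all_categories)
# Unspecified pairs default to the maximum seen distance (2.0) and the
# main diagonal is always 0.0:
# m == [[0.0, 2.0, 2.0],
#       [2.0, 0.0, 2.0],
#       [2.0, 2.0, 0.0]]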
Example #12
0
def treat_options( opts, arg, n_arg, usage_string ) :
    """
        Callback function that handles the command line options of this script.
        
        @param opts The options parsed by getopts.        
        @param arg The argument list parsed by getopts.        
        @param n_arg The number of arguments expected for this script.        
        @param usage_string The usage string printed if the arguments are wrong.        
    """
    
    global first_header
    global first_rater
    global calculate_pairwise
    global calculate_confusion
    global separator
    global distances_matrix
    global unknown
    
    treat_options_simplest( opts, arg, n_arg, usage_string )

    for ( o, a ) in opts:        
        if o in ("-r", "--raters") :
            verbose( "First row in file ignored -> considered as rater labels")
            first_header = True     
        if o in ("-i", "--items") : 
            verbose("First column in file ignored -> considered as item labels")        
            first_rater = 1 
        if o in ("-p", "--pairwise") : 
            verbose( "Computing pairwise coefficients" )
            calculate_pairwise = True
        if o in ("-u", "--unknown") : 
            verbose( "Unknown value - TODO: implement: " + a )
            unknown = a
        if o in ("-s", "--separator") : 
            verbose( "Field separator: " + a )
            separator = a
            if len( separator ) > 1 :
                warn("Multi-char field separator!")
        if o in ("-d", "--distance") :
            verbose("Calculating weighted coefficients using distance file")
            distances_matrix = read_distances( a )
            if distances_matrix is None :
                warn("Error in distance matrix! Weighted coefficients will use 1.0 as default distance")
        if o in ("-c", "--confusion") :
            verbose( "Calculating confusion matrices" )
            calculate_confusion = True
Example #13
0
def process_tree_branch(l, phrase):
    """ 
        This function processes the dependency tree that follows each tagged
        sentence. The only information retrieved here is the 'syn' attribute,
        corresponding to the relations between father/son words.

        @param l Line read from input to be processed
        
        @param phrase List of dictionaries to be completed
    """
    parts = l.strip().replace(" _", "").replace("(", "").replace(")",
                                                                 "").split(" ")
    rel = ""
    members = []
    for part in parts:
        if ":" not in part:
            rel = rel + "_" + part.replace("|", "")
        else:
            members.append(part)
    # First char is _
    # Also remove ; and : from rel, since they have special meanings in format
    rel = rel[1:].replace(";", "SEMICOLON").replace(":", "COLON")
    if len(members) >= 1:
        if len(members) >= 2:  # binary (typical) dependency relation
            # This line below converts RASP's token IDs into token positions in
            # moses format. This is required because sometimes RASP skips words
            # and assigns e.g. 1 2 4 5, so dependency 2->4 should be converted
            # into 2->3 in new sentence 1 2 3 4.
            head = phrase[int(get_tokens(members[0])[1])]["index"]
            syn = rel + ":" + head
            if len(members) == 3:
                syn = syn + ";" + rel + ":" + get_tokens(members[1])[1]
            son = get_tokens(members[-1])[1]
            entry = phrase.get(int(son), None)
        else:  # simple property: passive, have_to, etc.
            word_index = get_tokens(members[0])[1]
            entry = phrase.get(int(word_index), None)
            syn = rel
        if entry and syn:
            if entry["syn"] == "":
                entry["syn"] = syn
            else:
                entry["syn"] = entry["syn"] + ";" + syn
    else:
        warn("Unrecogized grammatical relation \"%s\"" % l.strip())
Example #14
0
def process_tree_branch(l, phrase):
    """ 
        This function processes the dependency tree that follows each tagged
        sentence. The only information retrieved here is the 'syn' attribute,
        corresponding to the relations between father/son words.

        @param l Line read from input to be processed
        
        @param phrase List of dictionaries to be completed
    """
    parts = l.strip().replace( " _", "" ).replace( "(", "" ).replace( ")", "" ).split( " " )
    rel = ""
    members = []
    for part in parts :
        if ":" not in part :
            rel = rel + "_" + part.replace("|","")
        else:
            members.append( part )
    # First char is _
    # Also remove ; and : from rel, since they have special meanings in format
    rel = rel[1:].replace( ";", "SEMICOLON").replace( ":", "COLON" )
    if len(members) >= 1 :
        if len(members) >= 2:    # binary (typical) dependency relation                 
            # This line below converts RASP's token IDs into token positions in
            # moses format. This is required because sometimes RASP skips words
            # and assigns e.g. 1 2 4 5, so dependency 2->4 should be converted
            # into 2->3 in new sentence 1 2 3 4.
            head = phrase[ int( get_tokens( members[0] )[1] ) ][ "index" ] 
            syn = rel + ":" + head 
            if len(members) == 3 :
                syn = syn + ";" + rel + ":" + get_tokens( members[1] )[1]
            son = get_tokens( members[-1] )[1]
            entry = phrase.get( int(son), None )        
        else:                    # simple property: passive, have_to, etc.    
            word_index = get_tokens( members[ 0 ] )[ 1 ]
            entry = phrase.get( int(word_index), None )
            syn = rel
        if entry and syn :
            if entry[ "syn" ] == "" :
                entry[ "syn" ] = syn
            else :
                entry[ "syn" ] = entry[ "syn" ] + ";" + syn
    else :        
        warn( "Unrecogized grammatical relation \"%s\"" % l.strip() )
Example #15
0
def treat_options( opts, arg, n_arg, usage_string ) :
    """Callback function that handles the command line options of this script.
    @param opts The options parsed by getopts. Ignored.
    @param arg The argument list parsed by getopts.
    @param n_arg The number of arguments expected for this script.
    """
    global input_patterns
    global input_filetype_ext
    global output_filetype_ext
    global match_distance
    global non_overlapping
    global id_order
    global annotate
    global only_the_matching_subpart

    util.treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o == "--input-from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        elif o in ("-p", "--patterns"):
            input_patterns = filetype.parse_entities([a])
        elif o in ("-d", "--match-distance") : 
            match_distance = a
        elif o in ("-N", "--non-overlapping") : 
            non_overlapping = True
        elif o == "--id-order":
            id_order = a.split(":")
        elif o == "--annotate":
            annotate = True
        elif o == "--only-matching":
            only_the_matching_subpart = True
        else:
            raise Exception("Bad arg " + o)

    if input_patterns is None:
        util.error("No patterns provided. Option --patterns is mandatory!")

    if only_the_matching_subpart and annotate:
        util.warn("Switch --only-matching disables --annotate")
Example #16
0
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.
        
        @param opts The options parsed by getopts. Ignored.
        
        @param arg The argument list parsed by getopts.
        
        @param n_arg The number of arguments expected for this script.    
    """
    global feat_list
    global ascending
    global input_filetype_ext
    global output_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    a_or_d = []
    for (o, a) in opts:
        if o in ("-f", "--feat"):
            feat_list = treat_feat_list(a)
        elif o in ("-a", "--asc"):
            ascending = True
            a_or_d.append("a")
        elif o in ("-d", "--desc"):
            ascending = False
            a_or_d.append("d")
        elif o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        else:
            raise Exception("Bad arg")

    if len(a_or_d) > 1:
        warn("You must provide only one option, -a OR -d. " \
                "Only the last one will be considered.")
Example #17
0
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.
        
        @param opts The options parsed by getopts. Ignored.
        
        @param arg The argument list parsed by getopts.
        
        @param n_arg The number of arguments expected for this script.    
    """
    global feat_list
    global ascending
    global input_filetype_ext
    global output_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    a_or_d = []
    for ( o, a ) in opts:
        if o in ("-f", "--feat"):
            feat_list = treat_feat_list(a)
        elif o in ("-a", "--asc"):
            ascending = True
            a_or_d.append("a")
        elif o in ("-d", "--desc"):
            ascending = False
            a_or_d.append("d")
        elif o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        else:
            raise Exception("Bad arg")

    if len(a_or_d) > 1:
        warn("You must provide only one option, -a OR -d. " \
                "Only the last one will be considered.")
Example #18
0
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.

        @param opts The options parsed by getopts. Ignored.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.
    """
    global feat_list
    global ascending
    global print_precs
    global input_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    a_or_d = []
    for (o, a) in opts:
        if o in ("-f", "--feat"):
            feat_list = treat_feat_list(a)
        elif o in ("-a", "--asc"):
            ascending = True
            a_or_d.append("a")
        elif o in ("-d", "--desc"):
            ascending = False
            a_or_d.append("d")
        elif o in ("-p", "--precs"):
            print_precs = True
        elif o == "--from":
            input_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    if len(a_or_d) > 1:
        warn("you should provide only one option, -a OR -d. Only the last one"+\
             " will be considered.")
    if not feat_list:
        error("You MUST provide at least one feature with -f")
Example #19
0
def get_percents( token_stats ) :
    """
        Given a vocabulary entry for a given word key, returns a dictionary 
        containing the corresponding percents, i.e. the proportion of a given
        occurrence wrt to all occurrences of that word. For instance:
        `token_stats` = { "The": 100, "the": 350, "THE": 50 } will return 
        { "The": .2, "the": .7, "THE": .1 } meaning that the word "the" occurrs
        20% of the times in Firstupper configuration, 70% in lowercase and 10% 
        in UPPERCASE. The sum of all dictionary values in the result is 1.
        
        Forms occurring at the beginning of a sentence or after a period are
        ignored, since they might have case modifications due to their position.
        
        @param token_stats A vocabulary entry that associates case 
        configurations to an integer number of occurrences.
        
        @param token_stats A dictionary that associates case configurations to 
        a float percent value equal to the number of occurrences of that 
        configuration divided by the total number of occurrences of that word.
       
    """
    percents = {}
    total_count = 0
    for a_form in token_stats.keys() :
        count = percents.get( a_form, 0 )
        count_notstart = token_stats[ a_form ][ 0 ] - token_stats[ a_form ][ 1 ]
        # Smoothing to avoid division by zero (occurs ONLY in first position)
        # Add-one smoothing is simple and solves the problem
        count_notstart += 1
        count = count + count_notstart
        percents[ a_form ] = count
        total_count = total_count + count_notstart
    for a_form in percents.keys() :
        if total_count != 0 :
            percents[ a_form ] = percents[ a_form ] / float(total_count)
        else :
            warn("Percents cannot be calculated for non-occurring words!")
    return percents
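
A worked example, using the pair-per-entry layout described above:

# Each entry is (total occurrences, sentence-initial occurrences).
token_stats = { "The": (40, 21), "the": (80, 1) }
# Non-initial counts plus add-one smoothing: "The" -> 19+1 = 20,
# "the" -> 79+1 = 80, total = 100, hence:
print( get_percents( token_stats ) )  # {'The': 0.2, 'the': 0.8}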
Example #20
0
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.

        @param opts The options parsed by getopts. Ignored.

        @param arg The argument list parsed by getopts.

        @param n_arg The number of arguments expected for this script.
    """
    global feat_list
    global ascending
    global print_precs
    global input_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    a_or_d = []
    for (o, a) in opts:
        if o in ("-f", "--feat"):
            feat_list = treat_feat_list(a)
        elif o in ("-a", "--asc"):
            ascending = True
            a_or_d.append("a")
        elif o in ("-d", "--desc"):
            ascending = False
            a_or_d.append("d")
        elif o in ("-p", "--precs"):
            print_precs = True
        elif o == "--from":
            input_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    if len(a_or_d) > 1:
        warn("you should provide only one option, -a OR -d. Only the last one" + " will be considered.")
    if not feat_list:
        error("You MUST provide at least one feature with -f")
Example #21
0
def get_freq_web1t(surfaces, lemmas, pos):
    """
        Gets the frequency (number of occurrences) of an ngram in Google's
        Web 1T 5-gram Corpus.
    """

    global build_entry, web1t_data_path

    length = len(surfaces)

    if length > 5:
        warn("Cannot count the frequency of an n-gram, n>5!")
        return 0

    search_term = ' '.join(map(build_entry, surfaces, lemmas, pos))

    # Find the file in which to look for the ngram.
    if length == 1:
        filename = web1t_data_path + "/1gms/vocab.gz"
    else:
        indexfile = web1t_data_path + "/%dgms/%dgm.idx" % (length, length)
        filenames = [x.split("\t") for x in read_file(indexfile).split("\n")]
        filename = None
        for (name, first) in filenames:
            # Assumes byte-value-based ordering!
            if first > search_term:
                break
            else:
                filename = name

        if filename is None:
            return 0
        filename = "%s/%dgms/%s" % (web1t_data_path, length, filename)

    verbose("WEB1T: Opening %s, looking for %s" % (filename, search_term))

    # This has been absurdly slow in Python.
    #file = gzip.open(filename, "rb")
    #
    #search_term += "\t"
    #freq = 0
    #
    #for line in file:
    #    if line.startswith(search_term):
    #        freq = int(line.split("\t")[1])
    #        break
    #
    #print >>sys.stderr, "buenito: %d" % freq
    #
    #file.close()

    pipe = subprocess.Popen(
        ["zgrep", "--", "^" + re.escape(search_term) + "\t", filename],
        stdout=subprocess.PIPE).stdout
    line = pipe.read()
    pipe.close()
    if line:
        freq = int(line.split("\t")[1])
    else:
        freq = 0
    verbose("freq = " + str(freq))
    return freq
Example #22
0
def get_freq_web1t(surfaces, lemmas, pos):
    """
        Gets the frequency (number of occurrences) of an ngram in Google's
        Web 1T 5-gram Corpus.
    """

    global build_entry, web1t_data_path

    length = len(surfaces)

    if length > 5:
        warn("Cannot count the frequency of an n-gram, n>5!")
        return 0

    search_term = ' '.join(map(build_entry, surfaces, lemmas, pos))

    # Find the file in which to look for the ngram.
    if length == 1:
        filename = web1t_data_path + "/1gms/vocab.gz"
    else:
        indexfile = web1t_data_path + "/%dgms/%dgm.idx" % (length, length)
        filenames = [x.split("\t") for x in read_file(indexfile).split("\n")]
        filename = None
        for (name, first) in filenames:
            # Assumes byte-value-based ordering!
            if first > search_term:
                break
            else:
                filename = name

        if filename is None:
            return 0
        filename = "%s/%dgms/%s" % (web1t_data_path, length, filename)

    verbose("WEB1T: Opening %s, looking for %s" % (filename, search_term))

    # This has been absurdly slow in Python.
    #file = gzip.open(filename, "rb")
    #
    #search_term += "\t"
    #freq = 0
    #
    #for line in file:
    #    if line.startswith(search_term):
    #        freq = int(line.split("\t")[1])
    #        break
    #
    #print >>sys.stderr, "buenito: %d" % freq
    #
    #file.close()

    pipe = subprocess.Popen(
        ["zgrep", "--", "^" + re.escape(search_term) + "\t", filename],
        stdout=subprocess.PIPE).stdout
    line = pipe.read()
    pipe.close()
    if line:
        freq = int(line.split("\t")[1])
    else:
        freq = 0
    verbose("freq = " + str(freq))
    return freq
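
The shard selection above depends on the .idx file listing each shard's first n-gram in bytewise order. A small simulation of that loop with an invented two-entry index makes the logic concrete:

# Hypothetical index: (shard name, first n-gram in shard), bytewise-sorted.
index = [("2gm-0000.gz", "! !"), ("2gm-0001.gz", "merry christmas")]
search_term = "new york"
filename = None
for (name, first) in index:
    if first > search_term:  # we have passed the shard that could contain it
        break
    filename = name
print(filename)  # 2gm-0001.gz, since "merry christmas" <= "new york"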
Example #23
0
def calculate_ams(o, m_list, N, corpus_name):
    """
        Given a joint frequency of the ngram, a list of individual frequencies,
        a corpus size and a corpus name, generates a list of `Features`, each
        containing the value of an Association Measure.
        
        @param o The float value corresponding to the number of occurrences of 
        the ngram.
        
        @param m_list A list of float values corresponding to the number of 
        occurrences of each of the words composing the ngram. The list should
        NEVER be empty, otherwise the result is undefined.
        
        @param N The float value corresponding to the number of tokens in the
        corpus, i.e. its total size. The size of the corpus should NEVER be 
        zero, otherwise the result is undefined.
        
        @param corpus_name A string that uniquely identifies the corpus from
        which the counts were drawn.        
    """
    # N is never null!!!
    # m_list is never empty!!!
    global measures, heuristic_combine, not_normalize_mle, warn_ll_bigram_only
    feats = []
    f_sum = 0
    n = len(m_list)
    e = expect(m_list, N)
    if "mle" in measures:
        if not_normalize_mle:
            mle = int(o)
        else:
            mle = o / N
        feats.append(Feature("mle_" + corpus_name, mle))
    if "pmi" in measures:
        if e != 0 and o != 0:
            pmi = math.log(o / e, 2)
        else:
            pmi = 0.0
        feats.append(Feature("pmi_" + corpus_name, pmi))
    if "t" in measures:
        if o != 0.0:
            t = (o - e) / math.sqrt(o)
        else:
            t = 0.0
        feats.append(Feature("t_" + corpus_name, t))
    if "dice" in measures:
        if sum(m_list) != 0.0:
            dice = (n * o) / sum(m_list)
        else:
            dice = 0.0
        feats.append(Feature("dice_" + corpus_name, dice))
    if "ll" in measures:
        if len(m_list) == 2:
            # Contingency tables observed, expected
            (ct_os, ct_es) = contingency_tables([o], m_list, N, corpus_name)
            ll_list = []  # Calculation is suitable for generic ngrams
            for (ct_o, ct_e) in zip(ct_os, ct_es):
                ll = 0.0
                for i in range(2):
                    for j in range(2):
                        if ct_o[i][j] != 0.0:
                            ll += ct_o[i][j] * (math.log(ct_o[i][j], 10) -
                                                math.log(ct_e[i][j], 10))
                ll *= 2
                ll_list.append(ll)
            ll_final = heuristic_combine(ll_list)
        else:
            if warn_ll_bigram_only:
                warn_ll_bigram_only = False
                warn("log-likelihood is only implemented for 2grams. "
                     "Defaults to 0.0 for n>2")
            ll_final = 0.0
        feats.append(Feature("ll_" + corpus_name, ll_final))
    return feats
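
All measures above depend on expect(m_list, N), which is defined elsewhere. Under the usual independence assumption it is the expected n-gram count: the product of the word counts divided by N to the power n-1. A sketch of that assumption (not the original implementation):

def expect(m_list, N):
    """Expected ngram count if the component words were independent:
       N * prod(m_i / N) = prod(m_i) / N**(len(m_list) - 1)."""
    e = float(N)
    for m in m_list:
        e *= m / float(N)
    return e

# Example: two words with 100 and 50 occurrences in a 10000-token corpus are
# expected to co-occur 100 * 50 / 10000 = 0.5 times by chance.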
Example #24
0
def contingency_tables( bigram_freqs, unigram_freqs, N, corpus_name ):
    """
        Given an ngram (generic n) w_1 ... w_n, the input is a couple of lists
        containing integer frequencies, the output is a couple of lists with
        contingency tables. The first list contains bigram frequencies
        [ f(w_1 w_2), f(w_2 w_3), ..., f(w_n-1 w_n) ]. The second list contains
        unigram frequencies [ f(w_1), f(w_2), ..., f(w_n) ]. While the first
        list contains n-1 elements, the second list contains n elements. The
        result is a couple of lists with contingency tables, the first
        corresponds to the observed frequencies, the second to expected
        frequencies. The contingency tables are 2D lists that contain the 4
        possible outcomes for the occurrence of a bigram, i.e. c(w1 w2),
        c(w1 ~w2), c(~w1 w2) and c(~w1 ~w2), where "~w" means "any word but w".
        Observed contingency tables are exact calculations based on simple
        set operations (intersection, difference). The expected frequencies are
        calculated using maximum likelihood for independent events (e.g. the
        occurrence of w1 does not change the probability of the occurrence of w2
        or of ~w2 immediately after w1, also noted P(w2|w1)=P(w2)).

        @param bigram_freqs List of integers representing bigram frequencies.
        Notice that no bigram can occur more often than the words it contains.
        Any inconsistency will be automatically corrected and output as a
        warning. This list should contain n-1 elements.

        @param unigram_freqs List of integers representing unigram (word)
        frequencies. This list should have n elements.

        @param N The total number of tokens in the corpus.

        @param corpus_name The name of the corpus from which frequencies were
        drawn. This is only used in verbose mode to provide friendly output
        messages.

        @return a couple (observed, expected), where observed and expected are
        lists, both of size n-1, and each cell of each list contains a 2x2 table
        with observed and expected contingency tables for the bigrams given as
        input.
    """
    observed = []
    expected = []
    n = len( unigram_freqs )
    if len( bigram_freqs ) != n - 1 :
        warn( "Invalid unigram/bigram frequencies passed to "
              "calculate_negations function")
        return None
    # 1) Verify that all the frequencies are valid
    for i in range( len( bigram_freqs ) ) :
        if bigram_freqs[ i ] > unigram_freqs[ i ] or \
           bigram_freqs[ i ] > unigram_freqs[ i + 1 ] :
            warn( corpus_name + " unigrams must occur at least as often as the bigram.")
        if bigram_freqs[ i ] > unigram_freqs[ i ] :
            warn("Automatic correction: " + \
                                 str( unigram_freqs[ i ] ) + " -> " + \
                                 str( bigram_freqs[ i ] ))
            unigram_freqs[ i ] = bigram_freqs[ i ]
        if bigram_freqs[ i ] > unigram_freqs[ i + 1 ] :            
            warn("Automatic correction: " + \
                                 str( unigram_freqs[ i + 1 ] ) + " -> " + \
                                 str( bigram_freqs[ i ] ))
            unigram_freqs[ i + 1 ] = bigram_freqs[ i ]
    # 2) Calculate negative freqs 
    for i in range( len( bigram_freqs ) ) :        
        o = [ 2 * [ -1 ], 2 * [ -1 ] ]
        e = [ 2 * [ -1 ], 2 * [ -1 ] ]
        cw1 = unigram_freqs[ i ]
        cw2 = unigram_freqs[ i + 1 ]
        cw1w2 = bigram_freqs[ i ]        
        o[ 0 ][ 0 ] = cw1w2
        e[ 0 ][ 0 ] = expect( [ cw1, cw2 ], N )
        o[ 0 ][ 1 ] = cw1 - cw1w2
        e[ 0 ][ 1 ] = expect( [ cw1, N - n + 1 - cw2 ], N )
        o[ 1 ][ 0 ] = cw2 - cw1w2
        e[ 1 ][ 0 ] = expect( [ N - n + 1 - cw1, cw2 ], N )
        # BEWARE! THERE WAS A HUGE ERROR HERE, CORRECTED ON APRIL 18, 2012
        # ALL LOG-LIKELIHOOD VALUES CALCULATED BY THE TOOLKIT WERE WRONG!
        # PLEASE RE-RUN IF YOU USED THE OLD VERSION!
        o[ 1 ][ 1 ] = N - len( unigram_freqs )  + 1 - cw1 - cw2 + cw1w2 
        e[ 1 ][ 1 ] = expect( [ N - n + 1 - cw1, N - n + 1 - cw2 ], N )
        observed.append( o )
        expected.append( e )
    return (observed, expected)
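
A worked example for a single bigram (numbers invented; expected counts follow the independence assumption sketched after calculate_ams above):

# f(w1 w2) = 10, f(w1) = 100, f(w2) = 50, corpus of N = 10000 tokens
# (n = 2, so there are N - n + 1 = 9999 bigram positions).
( obs, exp ) = contingency_tables( [10], [100, 50], 10000, "toy" )
# obs[0] == [[10, 90],     # c(w1 w2),  c(w1 ~w2) = 100 - 10
#            [40, 9859]]   # c(~w1 w2), c(~w1 ~w2) = 9999 - 100 - 50 + 10
# exp[0][0][0] == 0.5      # 100 * 50 / 10000.0, chance co-occurrence count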
Example #25
0
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.
        
        @param opts The options parsed by getopts. Ignored.
        
        @param arg The argument list parsed by getopts.
        
        @param n_arg The number of arguments expected for this script.    
    """
    global thresh_source
    global thresh_value
    global equals_name
    global equals_value
    global reverse
    global minlength
    global maxlength
    global min_mweoccurs
    global max_mweoccurs
    global input_filetype_ext
    global output_filetype_ext

    treat_options_simplest(opts, arg, n_arg, usage_string)

    for (o, a) in opts:
        if o in ("-t", "--threshold"):
            threshold = interpret_threshold(a)
            if threshold:
                (thresh_source, thresh_value) = threshold
            else:
                error("The format of the -t argument must be <source>:"
                      "<value>\n<source> must be a valid corpus name and "
                      "<value> must be a non-negative integer")
        elif o in ("-e", "--equals"):
            equals = interpret_equals(a)
            if equals:
                (equals_name, equals_value) = equals
            else:
                error("The format of the -e argument must be <name>:"
                      "<value>\n<name> must be a valid feat name and "
                      "<value> must be a non-empty string")

        elif o in ("-p", "--patterns"):
            verbose("Reading patterns file")
            global patterns
            patterns = filetype.parse_entities([a])
        elif o in ("-r", "--reverse"):
            reverse = True
            verbose("Option REVERSE active")

        elif o in ("-i", "--minlength"):
            minlength = interpret_length(a, "minimum")
        elif o in ("-a", "--maxlength"):
            maxlength = interpret_length(a, "maximum")
        elif o == "--min-mweoccurs":
            min_mweoccurs = interpret_length(a, "minimum")
        elif o == "--max-mweoccurs":
            max_mweoccurs = interpret_length(a, "maximum")
        elif o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    if minlength > maxlength:
        warn("minlength should be <= maxlength")
    if min_mweoccurs > max_mweoccurs:
        warn("min-mweoccurs should be <= max-mweoccurs")
Example #26
0
def treat_options(opts, arg, n_arg, usage_string):
    """
        Callback function that handles the command line options of this script.
        
        @param opts The options parsed by getopts. Ignored.
        
        @param arg The argument list parsed by getopts.
        
        @param n_arg The number of arguments expected for this script.    
    """
    global patterns
    global ignore_pos
    global surface_instead_lemmas
    global print_cand_freq
    global print_source
    global match_distance
    global non_overlapping
    global input_filetype_ext
    global output_filetype_ext
    global id_order

    treat_options_simplest(opts, arg, n_arg, usage_string)

    mode = []
    patterns_file = None
    for (o, a) in opts:
        if o in ("-p", "--patterns"):
            mode.append("patterns")
            patterns_file = a
        elif o in ("-n", "--ngram"):
            create_patterns_file(a)
            mode.append("ngram")
        elif o in ("-g", "--ignore-pos"):
            ignore_pos = True
        elif o in ("-d", "--match-distance"):
            match_distance = a
        elif o in ("-N", "--non-overlapping"):
            non_overlapping = True
        elif o in ("-s", "--surface"):
            surface_instead_lemmas = True
        elif o in ("-S", "--source"):
            print_source = True
        elif o in ("-f", "--freq"):
            print_cand_freq = True
        elif o in ("-i", "--index"):
            input_filetype_ext = "BinaryIndex"
            warn("Option -i is deprecated; use --from=BinaryIndex")
        elif o == "--id-order":
            id_order = a.split(":")
        elif o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        else:
            raise Exception("Bad flag")

    if non_overlapping and match_distance == "All":
        # If we are taking all matches, we need to be able to overlap...
        error(
            "Conflicting options: --match-distance=All and --non-overlapping")

    if len(mode) != 1:
        error("Exactly one option, -p or -n, must be provided")
    if "patterns" in mode:
        global patterns
        patterns = filetype.parse_entities([patterns_file])
Example #27
0
def treat_options( opts, arg, n_arg, usage_string ) :
    """
        Callback function that handles the command line options of this script.
        
        @param opts The options parsed by getopts. Ignored.
        
        @param arg The argument list parsed by getopts.
        
        @param n_arg The number of arguments expected for this script.    
    """
    global patterns
    global ignore_pos
    global surface_instead_lemmas
    global print_cand_freq
    global print_source
    global match_distance
    global non_overlapping
    global input_filetype_ext
    global output_filetype_ext
    global id_order
    
    treat_options_simplest( opts, arg, n_arg, usage_string )
        
    mode = []
    patterns_file = None
    for ( o, a ) in opts:
        if o in ("-p", "--patterns") : 
            mode.append( "patterns" )
            patterns_file = a
        elif o in ( "-n", "--ngram" ) :
            create_patterns_file( a )
            mode.append( "ngram" )
        elif o in ("-g", "--ignore-pos") : 
            ignore_pos = True
        elif o in ("-d", "--match-distance") : 
            match_distance = a
        elif o in ("-N", "--non-overlapping") : 
            non_overlapping = True
        elif o in ("-s", "--surface") : 
            surface_instead_lemmas = True
        elif o in ("-S", "--source") :
            print_source = True
        elif o in ("-f", "--freq") : 
            print_cand_freq = True
        elif o in ("-i", "--index") :
            input_filetype_ext = "BinaryIndex"
            warn("Option -i is deprecated; use --from=BinaryIndex")
        elif o == "--id-order":
            id_order = a.split(":")
        elif o == "--from" :
            input_filetype_ext = a
        elif o == "--to" :
            output_filetype_ext = a
        else:
            raise Exception("Bad flag")

    if non_overlapping and match_distance == "All":
        # If we are taking all matches, we need to be able to overlap...
        error("Conflicting options: --match-distance=All and --non-overlapping")

    if len(mode) != 1 :
        error("Exactly one option, -p or -n, must be provided")
    if "patterns" in mode:
        global patterns
        patterns = filetype.parse_entities([patterns_file])
Example #28
0
def calculate_ams( o, m_list, N, corpus_name ) :
    """
        Given a joint frequency of the ngram, a list of individual frequencies,
        a corpus size and a corpus name, generates a list of `Features`, each
        containing the value of an Association Measure.
        
        @param o The float value corresponding to the number of occurrences of 
        the ngram.
        
        @param m_list A list of float values corresponding to the number of 
        occurrences of each of the words composing the ngram. The list should
        NEVER be empty, otherwise the result is undefined.
        
        @param N The float value corresponding to the number of tokens in the
        corpus, i.e. its total size. The size of the corpus should NEVER be 
        zero, otherwise the result is undefined.
        
        @param corpus_name A string that uniquely identifies the corpus from
        which the counts were drawn.        
    """
    # N is never null!!!
    # m_list is never empty!!!
    global measures, heuristic_combine, not_normalize_mle, warn_ll_bigram_only
    feats = []
    f_sum = 0
    n = len( m_list )
    e = expect( m_list, N )
    if "mle" in measures :
        if not_normalize_mle :
            mle = int( o )
        else :
            mle = o / N
        feats.append( Feature( "mle_" + corpus_name, mle ) )
    if "pmi" in measures :
        if e != 0 and o != 0:
            pmi = math.log( o / e, 2 )
        else :
            pmi = 0.0
        feats.append( Feature( "pmi_" + corpus_name, pmi ) )
    if "t" in measures :
        if o != 0.0 :
            t = ( o - e ) / math.sqrt( o )
        else :
            t = 0.0
        feats.append( Feature( "t_" + corpus_name, t ) )
    if "dice" in measures :
        if sum( m_list ) != 0.0 :
            dice = ( n * o ) / sum( m_list )
        else :
            dice = 0.0
        feats.append( Feature( "dice_" + corpus_name, dice ) )
    if "ll" in measures :
        if len( m_list ) == 2 :
            # Contingency tables observed, expected
            ( ct_os, ct_es ) = contingency_tables( [o], m_list, N, corpus_name )
            ll_list = [] # Calculation is suitable for generic ngrams
            for (ct_o, ct_e) in zip( ct_os, ct_es ) :
                ll = 0.0
                for i in range( 2 ) :
                    for j in range( 2 ) :
                        if ct_o[i][j] != 0.0 :
                            ll += ct_o[i][j] * ( math.log( ct_o[i][j], 10 ) -
                                                 math.log( ct_e[i][j], 10 ) )
                ll *= 2
                ll_list.append( ll )
            ll_final = heuristic_combine( ll_list )
        else :
            if warn_ll_bigram_only:
                warn_ll_bigram_only = False
                warn("log-likelihood is only implemented for 2grams. "
                     "Defaults to 0.0 for n>2")
            ll_final = 0.0
        feats.append( Feature( "ll_" + corpus_name, ll_final ) )
    return feats
Example #29
0
def treat_options( opts, arg, n_arg, usage_string ) :
    """
        Callback function that handles the command line options of this script.
        
        @param opts The options parsed by getopts. Ignored.
        
        @param arg The argument list parsed by getopts.
        
        @param n_arg The number of arguments expected for this script.    
    """
    global thresh_source
    global thresh_value
    global equals_name
    global equals_value
    global reverse
    global minlength
    global maxlength
    global min_mweoccurs
    global max_mweoccurs
    global input_filetype_ext
    global output_filetype_ext
    
    treat_options_simplest( opts, arg, n_arg, usage_string )    
    
    for ( o, a ) in opts:
        if o in ( "-t", "--threshold" ) : 
            threshold = interpret_threshold( a )
            if threshold :
                (thresh_source, thresh_value) = threshold
            else :
                error( "The format of the -t argument must be <source>:"
                       "<value>\n<source> must be a valid corpus name and "
                       "<value> must be a non-negative integer")
        elif o in ( "-e", "--equals" ) :
            equals = interpret_equals( a )
            if equals :
                ( equals_name, equals_value ) = equals
            else :
                error( "The format of the -e argument must be <name>:"
                       "<value>\n<name> must be a valid feat name and "
                       "<value> must be a non-empty string")

        elif o in ("-p", "--patterns") :
            verbose( "Reading patterns file" )
            global patterns
            patterns = filetype.parse_entities([a])
        elif o in ("-r", "--reverse") :
            reverse = True
            verbose("Option REVERSE active")

        elif o in ("-i", "--minlength") :
            minlength = interpret_length( a, "minimum" )
        elif o in ("-a", "--maxlength") :
            maxlength = interpret_length( a, "maximum" )
        elif o == "--min-mweoccurs":
            min_mweoccurs = interpret_length(a, "minimum")
        elif o == "--max-mweoccurs":
            max_mweoccurs = interpret_length(a, "maximum")
        elif o == "--from":
            input_filetype_ext = a
        elif o == "--to":
            output_filetype_ext = a
        else:
            raise Exception("Bad arg: " + o)

    if minlength > maxlength:
        warn("minlength should be <= maxlength")
    if min_mweoccurs > max_mweoccurs:
        warn("min-mweoccurs should be <= max-mweoccurs")
Example #30
0
def contingency_tables(bigram_freqs, unigram_freqs, N, corpus_name):
    """
        Given an ngram (generic n) w_1 ... w_n, the input is a couple of lists
        containing integer frequencies, the output is a couple of lists with
        contingency tables. The first list contains bigram frequencies
        [ f(w_1 w_2), f(w_2 w_3), ..., f(w_n-1 w_n) ]. The second list contains
        unigram frequencies [ f(w_1), f(w_2), ..., f(w_n) ]. While the first
        list contains n-1 elements, the second list contains n elements. The
        result is a couple of lists with contingency tables, the first
        corresponds to the observed frequencies, the second to expected
        frequencies. The contingency tables are 2D lists that contain the 4
        possible outcomes for the occurrence of a bigram, i.e. c(w1 w2),
        c(w1 ~w2), c(~w1 w2) and c(~w1 ~w2), where "~w" means "any word but w".
        Observed contingency tables are exact calculations based on simple
        set operations (intersection, difference). The expected frequencies are
        calculated using maximum likelihood for independent events (e.g. the
        occurrence of w1 does not change the probability of the occurrence of w2
        or of ~w2 immediately after w1, also noted P(w2|w1)=P(w2)).

        @param bigram_freqs List of integers representing bigram frequencies.
        Notice that no bigram can occur more often than the words it contains.
        Any inconsistency will be automatically corrected and output as a
        warning. This list should contain n-1 elements.

        @param unigram_freqs List of integers representing unigram (word)
        frequencies. This list should have n elements.

        @param N The total number of tokens in the corpus.

        @param corpus_name The name of the corpus from which frequencies were
        drawn. This is only used in verbose mode to provide friendly output
        messages.

        @return a couple (observed, expected), where observed and expected are
        lists, both of size n-1, and each cell of each list contains a 2x2 table
        with observed and expected contingency tables for the bigrams given as
        input.
    """
    observed = []
    expected = []
    n = len(unigram_freqs)
    if len(bigram_freqs) != n - 1:
        warn("Invalid unigram/bigram frequencies passed to "
             "calculate_negations function")
        return None
    # 1) Verify that all the frequencies are valid
    for i in range(len(bigram_freqs)):
        if bigram_freqs[i] > unigram_freqs[i] or \
           bigram_freqs[i] > unigram_freqs[i + 1]:
            warn(corpus_name +
                 " unigrams must occur at least as often as the bigram.")
        if bigram_freqs[i] > unigram_freqs[i]:
            warn("Automatic correction: " + str(unigram_freqs[i]) +
                 " -> " + str(bigram_freqs[i]))
            unigram_freqs[i] = bigram_freqs[i]
        if bigram_freqs[i] > unigram_freqs[i + 1]:
            warn("Automatic correction: " + str(unigram_freqs[i + 1]) +
                 " -> " + str(bigram_freqs[i]))
            unigram_freqs[i + 1] = bigram_freqs[i]
    # 2) Calculate negative freqs
    for i in range(len(bigram_freqs)):
        o = [2 * [-1], 2 * [-1]]
        e = [2 * [-1], 2 * [-1]]
        cw1 = unigram_freqs[i]
        cw2 = unigram_freqs[i + 1]
        cw1w2 = bigram_freqs[i]
        o[0][0] = cw1w2
        e[0][0] = expect([cw1, cw2], N)
        o[0][1] = cw1 - cw1w2
        e[0][1] = expect([cw1, N - n + 1 - cw2], N)
        o[1][0] = cw2 - cw1w2
        e[1][0] = expect([N - n + 1 - cw1, cw2], N)
        # BEWARE! THERE WAS A HUGE ERROR HERE, CORRECTED ON APRIL 18, 2012
        # ALL LOG-LIKELIHOOD VALUES CALCULATED BY THE TOOLKIT WERE WRONG!
        # PLEASE RE-RUN IF YOU USED THE OLD VERSION!
        o[1][1] = N - len(unigram_freqs) + 1 - cw1 - cw2 + cw1w2
        e[1][1] = expect([N - n + 1 - cw1, N - n + 1 - cw2], N)
        observed.append(o)
        expected.append(e)
    return (observed, expected)