예제 #1
0
    def em_step(self,
                corpus,
                parser_class,
                normalization_groups,
                bitext=False):
        """
        Perform a single step of EM on the corpus and re-estimate rule weights.

        Every item of ``corpus`` is parsed with a ``parser_class`` instance
        built from this grammar. The expected rule counts of all charts that
        were found are summed, stored as the new rule weights, and the
        grammar is then renormalized via ``normalize_by_groups``.

        Args:
            corpus: Input handed unchanged to the parser's ``parse_*`` method.
            parser_class: Parser class; instantiated as ``parser_class(self)``.
            normalization_groups: Passed through to
                ``self.normalize_by_groups``.
            bitext: If true, parse with ``parse_bitexts``. Otherwise
                ``parse_strings`` is used when ``self.rhs1_type`` is
                ``"string"`` and ``parse_graphs`` in all other cases.

        Returns:
            The sum of ``inside_probs["START"]`` over all items that parsed.

        Note:
            Logs an error and calls ``sys.exit(1)`` when ``ParserTD`` is
            requested for bitext or string parsing.
        """
        parser = parser_class(self)
        if bitext:
            if parser_class == ParserTD:
                log.err(
                    "Bigraph parsing with tree decomposition based parser is not yet implemented. Use '-p basic'."
                )
                sys.exit(1)
            parse_generator = parser.parse_bitexts(corpus)
        elif self.rhs1_type == "string":
            if parser_class == ParserTD:
                log.err(
                    "Parser class needs to be 'basic' to parse strings.")
                sys.exit(1)
            parse_generator = parser.parse_strings(corpus)
        else:
            parse_generator = parser.parse_graphs(corpus)

        ll = 0.0
        counts = defaultdict(float)

        # Sentence numbers in the warning are 1-based.
        for i, chart in enumerate(parse_generator, 1):
            if not chart:
                log.warn("No parse for sentence %d." % i)
                continue
            inside_probs = chart.inside_scores()
            outside_probs = chart.outside_scores(inside_probs)
            ll += inside_probs["START"]
            counts_for_graph = chart.expected_rule_counts(
                inside_probs, outside_probs)
            for r in counts_for_graph:
                counts[r] += counts_for_graph[r]

        # Walk over every rule of the grammar, not just the observed ones:
        # iterating `counts` made the LOGZERO branch unreachable, so rules
        # that never occurred in a parse silently kept their old weight.
        # NOTE(review): assumes iterating the grammar yields the same rule
        # ids that are used to index it (dict-like) -- confirm.
        for r in self:
            if r in counts:
                self[r].weight = counts[r]
            else:
                self[r].weight = LOGZERO

        self.normalize_by_groups(normalization_groups)

        return ll
예제 #2
0
파일: grammar.py 프로젝트: ChenluJi/bolinas
    def em_step(self, corpus, parser_class, normalization_groups, bitext = False):
        """ 
        Perform a single step of EM on the corpus and update the rule weights.

        The corpus is parsed with ``parser_class(self)``; for each chart
        returned, inside and outside scores are computed and the expected
        rule counts are accumulated. The accumulated counts become the new
        rule weights, which are then normalized with
        ``normalize_by_groups(normalization_groups)``.

        corpus -- passed unchanged to the selected ``parse_*`` method.
        parser_class -- class used to build the parser for this grammar.
        normalization_groups -- forwarded to ``normalize_by_groups``.
        bitext -- when true use ``parse_bitexts``; otherwise use
            ``parse_strings`` if ``self.rhs1_type == "string"`` and
            ``parse_graphs`` if not.

        Returns the sum of ``inside_probs["START"]`` over all parsed items.
        Logs an error and exits with status 1 if ``ParserTD`` is combined
        with bitext or string input.
        """
        parser = parser_class(self)
        if bitext:
            if parser_class == ParserTD:
                log.err("Bigraph parsing with tree decomposition based parser is not yet implemented. Use '-p basic'.")
                sys.exit(1)
            parse_generator = parser.parse_bitexts(corpus)
        elif self.rhs1_type == "string":
            if parser_class == ParserTD:
                log.err("Parser class needs to be 'basic' to parse strings.")
                sys.exit(1)
            parse_generator = parser.parse_strings(corpus)
        else:
            parse_generator = parser.parse_graphs(corpus)

        ll = 0.0
        counts = defaultdict(float)

        # Number the inputs from 1 so the warning matches the corpus order.
        for i, chart in enumerate(parse_generator, 1):
            if not chart:
                log.warn("No parse for sentence %d." % i)
                continue
            inside_probs = chart.inside_scores()
            outside_probs = chart.outside_scores(inside_probs)
            ll += inside_probs["START"]
            counts_for_graph = chart.expected_rule_counts(inside_probs, outside_probs)
            for r in counts_for_graph:
                counts[r] += counts_for_graph[r]

        # Iterate over the grammar's rules rather than over `counts`: with
        # `for r in counts` the test `r in counts` was always true, so the
        # LOGZERO assignment for unobserved rules could never run.
        # NOTE(review): relies on the grammar being iterable over its rule
        # ids (the keys accepted by self[r]) -- confirm against the class.
        for r in self:
            if r in counts:
                self[r].weight = counts[r]
            else:
                self[r].weight = LOGZERO

        self.normalize_by_groups(normalization_groups)

        return ll
예제 #3
0
def downUrl(url, dirPath=None):
    """Download ``url`` into ``dirPath`` unless the file is already there.

    The file name is the part of ``url`` after its last ``/``.

    Args:
        url: Address of the resource to fetch.
        dirPath: Target directory. When ``None``, the directory is
            ``path.getProjectPath() + "down"``. It is created if missing.

    Returns:
        None. If the target file already exists, this is logged via
        ``log.err`` and nothing is downloaded.
    """
    fileName = url.split("/")[-1]

    if dirPath is None:
        dirPath = path.getProjectPath() + "down"

    if not os.path.exists(dirPath):
        os.makedirs(dirPath)

    # Use the platform's separator instead of a hard-coded backslash, which
    # would become part of the file name on non-Windows systems.
    savePath = os.path.join(dirPath, fileName)

    if os.path.exists(savePath):
        log.err("had:" + savePath)
        return

    # Fetch only after the existence check so no request is wasted on a
    # file that is already present.
    urlFile = requests.get(url)

    print("down:" + url)
    with open(savePath, "wb") as code:
        code.write(urlFile.content)
예제 #4
0
        "-v",
        "--verbose",
        type=int,
        default=2,
        help=
        "Stderr output verbosity: 0 (all off), 1 (warnings), 2 (info, default), 3 (details), 3 (debug)"
    )

    args = argparser.parse_args()

    # Verify command line parameters
    if not args.output_type in [
            'forest', 'derivation', 'derived', 'yield', 'both'
    ]:
        log.err(
            "Output type (-ot) must be either 'forest', 'derivation', or 'derived'."
        )
        sys.exit(1)

    if not args.weight_type in ['prob', 'logprob']:
        log.err("Weight type (-m) must be either 'prob'or 'logprob'.")
        sys.exit(1)

    logprob = (args.weight_type == 'logprob')

    if args.output_type == "forest":
        if not args.output_file:
            log.err(
                "Need to provide '-o FILE_PREFIX' with output type 'forest'.")
            sys.exit(1)
        if args.k:
예제 #5
0
파일: bolinas.py 프로젝트: jimwhite/bolinas
    #weights.add_argument("-d","--randomize", default=False, action="store_true", help="Randomize weights to be distributed between 0.2 and 0.8. Useful for EM training.")
    weights.add_argument("-n","--normalize", default=False, action="store_true", help="Normalize weights. If -b is specified, rules with the same LHS sum up to 1.0. If -f is specified rules with the same LHS and second RHS sum up to 1.0. If -r is specified rules with the same LHS and first RHS sum up to 1.0.") 
    weights.add_argument("-t","--train", default=0, type=int, const=5, nargs='?', help="Use TRAIN iterations of EM to train weights for the grammar using the input (graph, string, or pairs of objects in alternating lines). Initialize with the weights in the grammar file or with uniform weights if none are provided. Writes a grammar file with trained weights to the output.")
    argparser.add_argument("-m", "--weight_type", default="prob", help="Input/output in real probabilities ('prob', default) or log probabilities ('logprob').")
    argparser.add_argument("-p","--parser", default="basic", help="Specify which graph parser to use. 'td': the tree decomposition parser of Chiang et al, ACL 2013 (default). 'basic': a basic generalization of CKY that matches rules according to an arbitrary visit order on edges (less efficient).")
    argparser.add_argument("-e","--edge_labels", action="store_true", default=False, help="Consider only edge labels when matching HRG rules. By default node labels need to match. Warning: The default is potentially unsafe when node-labels are used for non-leaf nodes on the target side of a synchronous grammar.")
    argparser.add_argument("-bn","--boundary_nodes", action="store_true", help="In the tree decomposition parser, use the full representation for graph fragments instead of the compact boundary node representation. This can provide some speedup for grammars with small rules.")
    #argparser.add_argument("-s","--remove_spurious", default=False, action="store_true", help="Remove spurious ambiguity. Only keep the best derivation for identical derived objects.")
    argparser.add_argument("-s","--start_symbol", default=None, type=str, help="Use this start symbol instead of the left hand side of the first rule in the grammar.")
    argparser.add_argument("-v","--verbose", type=int, default=2, help="Stderr output verbosity: 0 (all off), 1 (warnings), 2 (info, default), 3 (details), 3 (debug)")
    
    args = argparser.parse_args()
    
    # Verify command line parameters 
    if not args.output_type in ['forest', 'derivation', 'derived', 'yield', 'both']:
        log.err("Output type (-ot) must be either 'forest', 'derivation', or 'derived'.")
        sys.exit(1)
    
    if not args.weight_type in ['prob', 'logprob']:
        log.err("Weight type (-m) must be either 'prob'or 'logprob'.")
        sys.exit(1)

    logprob = (args.weight_type == 'logprob')


    if args.output_type == "forest":
        if not args.output_file:       
            log.err("Need to provide '-o FILE_PREFIX' with output type 'forest'.")
            sys.exit(1)
        if args.k:
            log.warn("Ignoring -k command line option because output type is 'forest'.")