def create_UD_treebank_list(self, options):
    """
    Build the list of UD treebanks requested through ``options.include``.

    Each requested entry is an ISO treebank id, optionally written as
    ``iso:proxy``; in that case the text after the last colon is stored on
    the treebank as ``proxy_tbank``. Ids that are not among the available
    treebanks are reported on stdout and skipped.

    For every selected treebank this method
      * sets ``proxy_tbank``, ``outdir`` and ``modeldir``,
      * creates the output directory when it does not exist yet,
      * calls ``prepareDev`` when training and ``createDebugData`` when
        ``options.debug`` is set.

    Side effect: ``options.conllu`` is set to True.

    Returns a flat list of treebank objects, in the order requested.
    NOTE(review): the previous docstring announced a nested list for the
    multilingual case; this method never builds one, so any such wrapping
    would have to happen in the caller -- confirm.

    Raises Exception when predicting in the non-multilingual setting and
    the model file of a treebank does not exist.
    """
    options.conllu = True  # file is in conllu format

    # index every treebank returned by utils.get_all_treebanks by its ISO id
    all_treebanks = utils.get_all_treebanks(options)
    treebank_dict = {tb.iso_id: tb for tb in all_treebanks}

    treebanks = []  # the treebanks we need

    # languages requested by the user via the include flag
    for iso in utils.parse_list_arg(options.include):
        proxy_tbank = None
        match = re.search(r'^(.*):(.*)$', iso)
        if match:
            # "iso:proxy" notation: split at the last colon
            iso, proxy_tbank = match.groups()

        if iso not in treebank_dict:
            print("Warning: skipping invalid language code " + iso)
            continue

        treebank = treebank_dict[iso]
        treebank.proxy_tbank = proxy_tbank

        # the shared task writes everything into the common output directory
        if options.shared_task:
            treebank.outdir = options.outdir
        else:
            treebank.outdir = os.path.join(options.outdir, treebank.iso_id)

        if os.path.exists(treebank.outdir):
            print("Warning: language-specific subdirectory " + treebank.outdir
                  + " already exists, contents may be overwritten")
        else:
            # create language-specific output folder if it doesn't exist
            print("Creating language-specific output directory "
                  + treebank.outdir)
            os.mkdir(treebank.outdir)

        if not options.predict:
            self.prepareDev(treebank, options)

        # it is important that prepareDev be called before createDebugData
        if options.debug:
            self.createDebugData(treebank, options)

        if options.predict and not options.multiling:
            # each treebank has its own model directory; the model file
            # must already be there
            treebank.modeldir = os.path.join(options.modeldir,
                                             treebank.iso_id)
            model = os.path.join(treebank.modeldir, options.model)
            if not os.path.exists(model):
                raise Exception("Model not found. Path tried: %s" % model)
        else:
            treebank.modeldir = None

        treebanks.append(treebank)

    return treebanks
if __name__ == "__main__":
    # Script entry point: collect statistics for every UD treebank whose
    # ISO id is listed in --include.
    parser = OptionParser()
    # The help text is assembled from adjacent string literals; a backslash
    # continuation inside a single literal would leak source indentation
    # into the message shown to the user.
    parser.add_option(
        "--include",
        metavar="LIST",
        help=("List of languages by ISO code to be run if using UD. "
              "If not specified need to specify trainfile at least. "
              "When used in combination with --multiling, trains a common "
              "parser for all languages. Otherwise, train monolingual "
              "parsers for each"))
    parser.add_option(
        "--datadir",
        metavar="PATH",
        help="Input directory with UD train/dev/test files; "
             "obligatory if using --include")
    (options, args) = parser.parse_args()

    # ugly but necessary: these two attributes have no command-line option
    # here, so they are set by hand (presumably read by utils.UDtreebank --
    # confirm)
    options.shared_task = False
    options.golddir = None

    iso_ids = utils.parse_list_arg(options.include)
    iso_dict = utils.load_iso_dict()
    # keep only the (name, iso_id) pairs of the requested treebanks
    treebank_metadata = [(name, iso_id)
                         for (name, iso_id) in iso_dict.items()
                         if iso_id in iso_ids]
    treebanks = [utils.UDtreebank(ele, options) for ele in treebank_metadata]

    # statistics are computed one treebank at a time
    for treebank in treebanks:
        #get_stats([treebank])
        get_stats_c([treebank])
def __init__(self, options):
    """
    Validate the parser options and prepare the treebanks to work on.

    input: parser options object
    to harmonise the way we deal with the parser

    Attributes set on ``self``:
      * ``iterations`` -- 1, or the number of languages in the
        multi-monolingual case,
      * ``conllu``     -- whether the data files are in conllu format,
      * ``languages``  -- list of treebank objects to process.

    ``options`` is modified in place: ``modeldir`` falls back to ``outdir``
    when predicting, ``rlMostFlag`` is switched off when ``rlFlag`` is also
    set, and ``drop_nproj`` / ``multi_monoling`` are (re)initialised.

    Output directories that do not exist yet are created.

    Raises Exception when a mandatory option is missing, when the test file
    does not exist, or when a required model file cannot be found (outside
    the shared-task fallback).
    """
    print('Using external embedding: %s' % (options.external_embedding,))
    self.deal_with_multiling(options)

    # ---- sanity checks on the options --------------------------------
    if options.include and not options.datadir:
        raise Exception(
            "You need to specify the data dir to include UD languages")

    if not options.predict:
        if not options.include and not options.trainfile:
            raise Exception(
                "If not using the --include option, you must specify your training data with --trainfile"
            )
    else:
        if not options.include and not options.testfile:
            raise Exception(
                "If not using the --include option, you must specify your test data with --testfile"
            )
        # NOTE(review): block structure was reconstructed from a
        # whitespace-collapsed source; confirm this default belongs to the
        # predict branch.
        if not options.modeldir:
            # set model directory to output directory by default
            options.modeldir = options.outdir

    if not options.outdir:
        raise Exception(
            "You must specify an output directory via the --outdir option")
    elif not os.path.exists(options.outdir):
        # create output directory if it doesn't exist
        print("Creating output directory " + options.outdir)
        os.mkdir(options.outdir)

    if not options.predict and not (options.rlFlag or options.rlMostFlag
                                    or options.headFlag):
        raise Exception(
            "Must include either head, rl or rlmost (For example, if you specified --disable-head and --disable-rlmost, you must specify --userl)"
        )

    if options.rlFlag and options.rlMostFlag:
        print('Warning: Switching off rlMostFlag to allow rlFlag to take precedence')
        options.rlMostFlag = False

    # TODO: maybe add more sanity checks

    # this is now useless
    options.drop_nproj = False

    # ---- defaults, possibly overridden below -------------------------
    options.multi_monoling = False
    self.iterations = 1
    self.conllu = True

    if not options.include:
        # train/dev/test files must be specified explicitly:
        # just one model specified by train/dev and/or test
        treebank = utils.Treebank(options.trainfile,
                                  options.devfile, options.testfile)
        treebank.iso_id = None
        treebank.outdir = options.outdir
        treebank.modeldir = options.modeldir

        if options.predict:
            # the first test is defensive: a missing test file was already
            # rejected by the sanity checks above
            if not options.testfile:
                raise Exception("--testfile must be specified")
            elif not os.path.exists(options.testfile):
                raise Exception("Test file " + options.testfile
                                + " not found")
            else:
                # test if file in conllu format
                self.conllu = (os.path.splitext(
                    options.testfile.lower())[1] == '.conllu')
                treebank.test_gold = options.testfile
        else:
            self.prepareDev(treebank, options)
            # the format is read off the dev file when given, otherwise off
            # the training file when dev data is created from it
            if options.devfile:
                self.conllu = (os.path.splitext(
                    options.devfile.lower())[1] == '.conllu')
            elif options.create_dev:
                self.conllu = (os.path.splitext(
                    options.trainfile.lower())[1] == '.conllu')

        if options.debug:
            self.createDebugData(treebank, options)

        # make it a list of one element just for the sake of consistency
        # with the "include" case
        self.languages = [treebank]

    else:
        self.conllu = True  # file is in conllu format
        # languages requested by the user via the include flag
        language_list = utils.parse_list_arg(options.include)
        # list of the available treebanks
        json_treebanks = utils.conll_dir_to_list(
            language_list, options.datadir, options.shared_task,
            options.shared_task_datadir)

        # keep the order given by the user and report unknown codes
        treebank_dict = {tb.iso_id: tb for tb in json_treebanks}
        self.languages = []
        for lang in language_list:
            if lang in treebank_dict:
                self.languages.append(treebank_dict[lang])
            else:
                print("Warning: skipping invalid language code " + lang)

        if options.multiling:
            if options.predict:
                # a single shared model, stored directly in modeldir
                model = "%s/%s" % (options.modeldir, options.model)
                if not os.path.exists(model):
                    raise Exception("Model not found. Path tried: %s"
                                    % model)
        else:
            # one parser per language
            options.multi_monoling = True
            self.iterations = len(self.languages)

        for language in self.languages:
            language.outdir = "%s/%s" % (options.outdir, language.iso_id)
            if not os.path.exists(language.outdir):
                # create language-specific output folder if it doesn't exist
                print("Creating language-specific output directory "
                      + language.outdir)
                os.mkdir(language.outdir)
            else:
                print("Warning: language-specific subdirectory "
                      + language.outdir
                      + " already exists, contents may be overwritten")

            if not options.predict:
                self.prepareDev(language, options)

            # it is important that prepareDev be called before
            # createDebugData
            if options.debug:
                self.createDebugData(language, options)

            if options.predict and options.multi_monoling:
                # multi-monolingual case: each language has its own model
                # directory
                language.modeldir = "%s/%s" % (options.modeldir,
                                               language.iso_id)
                model = "%s/%s" % (language.modeldir, options.model)
                if not os.path.exists(model):
                    if not options.shared_task:
                        raise Exception("Model not found. Path tried: %s"
                                        % model)
                    # shared task: find model for the language in question
                    # by pointing at the treebank whose iso_id equals the
                    # shared language code
                    for otherl in json_treebanks:
                        if (otherl.lcode == language.lcode
                                and otherl.lcode == otherl.iso_id):
                            language.modeldir = "%s/%s" % (
                                options.modeldir, otherl.iso_id)
def __init__(self, options):
    """
    Validate the parser options and set up the treebank(s) to work on.

    input: parser options object
    to harmonise the way we deal with the parser

    Without ``options.include`` a single ``self.treebank`` is built from
    the explicit train/dev/test files and ``self.conllu`` is derived from
    the extension of the test file (predicting) or dev file (training).

    With ``options.include`` the requested treebanks are stored in
    ``self.languages``; each gets ``outdir`` and ``modelDir`` attributes and
    its output directory is created when missing.

    ``options.multi_monoling`` and ``self.iterations`` are set according to
    ``options.include`` / ``options.multiling``; ``options.drop_proj`` is
    reset to False.

    Raises Exception when the data dir is missing for ``--include``, when
    no feature flag is set for training, or when a model is not found while
    predicting outside the shared task.
    """
    if options.include and not options.datadir:
        # one plain literal: a backslash continuation inside the string
        # would leak source whitespace into the message
        raise Exception(
            "You need to specify the data dir to include UD languages")

    # TODO: maybe add more sanity checks
    if not options.predictFlag and not (options.rlFlag or options.rlMostFlag
                                        or options.headFlag):
        raise Exception("You must use either --userlmost or --userl or "
                        "--usehead (you can use multiple)")
    # the diff between two is one is r/l/most child / the other is
    # element in the sentence
    # Eli's paper:
    # extended feature set
    # rightmost and leftmost modifiers of s0, s1 and s2 + leftmost
    # modifier of b0

    if not options.include:
        # just one model specified by train/dev and/or test
        if options.predictFlag:
            data_file = options.conll_test
        else:
            data_file = options.conll_dev
        self.conllu = (os.path.splitext(data_file.lower())[1] == '.conllu')
        self.treebank = utils.Treebank(options.conll_train,
                                       options.conll_dev, options.conll_test)
        self.treebank.iso_id = None
    else:
        self.conllu = True
        language_list = utils.parse_list_arg(options.include)
        json_treebanks = utils.conll_dir_to_list(
            language_list, options.datadir, options.shared_task,
            options.shared_task_datadir)
        self.languages = [lang for lang in json_treebanks
                          if lang.iso_id in language_list]

        for language in self.languages:
            language.removeme = False
            language.outdir = "%s/%s" % (options.output, language.iso_id)
            language.modelDir = "%s/%s" % (options.modelDir, language.iso_id)
            model = "%s/%s" % (language.modelDir, options.model)
            if options.predictFlag and not os.path.exists(model):
                if not options.shared_task:
                    raise Exception("Model not found. Path tried: %s"
                                    % model)
                # shared task: find model for the language in question by
                # pointing at the treebank whose iso_id equals the shared
                # language code
                for otherl in json_treebanks:
                    if (otherl.lcode == language.lcode
                            and otherl.lcode == otherl.iso_id):
                        language.modelDir = "%s/%s" % (options.modelDir,
                                                       otherl.iso_id)

            if not os.path.exists(language.outdir):
                os.mkdir(language.outdir)

        # Drop flagged languages by rebuilding the list: calling
        # list.remove() while iterating over the same list skips elements.
        self.languages = [language for language in self.languages
                          if not language.removeme]

    if options.include and not options.multiling:
        # one parser per included language
        options.multi_monoling = True
        self.iterations = len(self.languages)
    else:
        options.multi_monoling = False
        self.iterations = 1

    # this is now useless
    options.drop_proj = False