def process_json(args):
    sin = openf(args.json_fn)
    json_in = load(fp=sin, encoding='utf-8')
    sentences_in = json_in['sentences']
    try:
        json_out = parse(lang, [sanitized(s['ctext']) for s in sentences_in])
    except KeyError:
        # from nltk.tokenize import TreebankWordTokenizer
        # _tokenize = TreebankWordTokenizer().tokenize
        _j = u' '.join

        def make_ctext(s):
            # this has side effects
            ctext = sanitized(s['text'])
            s['ctext'] = ctext
            return ctext

        json_out = parse(lang, [make_ctext(s) for s in sentences_in])
    json_out.update((k, v) for k, v in json_in.items() if k != 'sentences')
    # Sanity check: verify we haven't modified ctext
    if False:
        for idx, (sent_in, sent_out) in enumerate(zip(json_in['sentences'],
                                                      json_out['sentences']),
                                                  start=1):
            ctext_in, ctext_out = sent_in['ctext'], sent_out['ctext']
            try:
                assert ctext_in == ctext_out
            except AssertionError:
                dprint(u'error at line {}:\n {} \n!=\n {}'.format(
                    idx, ctext_in, ctext_out))
    output(json_out)
def create_read_iterator(self, line_num=10000):
    '''Read line_num lines from an xml file and yield them as DataFrames.

    :param line_num (int, optional): number of lines to read per chunk; -1 reads the whole file
    '''
    # read the file, create the xml
    count = 0
    l = []
    with open(self.file_name, 'r') as f:
        for i, line in enumerate(f):
            try:
                l.append(dict(BeautifulSoup(line).row.attrs))
                count += 1
            except:
                dprint('Error on line {}'.format(line))
            # check if the lines are fetched already
            if count >= line_num and line_num > 0:
                dprint('Processed {}'.format(i))
                yield DataFrame(l)
                l = []
                count = 0
    # in case it is reading all or finished the loop in the middle
    if count != 0:
        yield DataFrame(l)
    return
def cross_validate(classifier, top_words, nonnull=False):
    """Given the most common words in the Spanish corpus, cross-validate our
    classifiers for each of those."""
    ## return a map from word to [(ncorrect, size)]
    out = defaultdict(list)
    util.dprint("cross validating this many words:", len(top_words))
    for w in top_words:
        util.dprint("cross validating:", w)
        training = trainingdata.trainingdata_for(w, nonnull=nonnull)
        labels = set(label for (feat, label) in training)
        if len(labels) < 2:
            continue
        if len(training) < 10:
            print("not enough samples for", w)
            continue
        ## using constant random_state of 0 for reproducibility
        cv = cross_validation.KFold(len(training), n_folds=10,
                                    shuffle=False, random_state=0)
        for traincv, testcv in cv:
            mytraining = [training[i] for i in traincv]
            mytesting = [training[i] for i in testcv]
            mytraining = mytraining + [({"absolutelynotafeature": True},
                                        "absolutelynotalabel")]
            classifier.train(mytraining)
            ncorrect = count_correct(classifier, mytesting)
            out[w].append((ncorrect, len(mytesting)))
    return out
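# Illustrative usage sketch, not from the original source: aggregate the
# (ncorrect, size) pairs that cross_validate returns into a per-word accuracy.
# The classifier and top_words arguments are assumed to come from the
# surrounding experiment code.
def report_accuracy(classifier, top_words):
    results = cross_validate(classifier, top_words)
    for word, folds in sorted(results.items()):
        ncorrect = sum(c for (c, n) in folds)
        total = sum(n for (c, n) in folds)
        print(word, float(ncorrect) / total if total else 0.0)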
def lm(sentence, relation, seed, noun_l, verb_l):
    """Outputs an LM with all the required keys."""
    def offset(lemma, idx):
        "Finds offset of <lemma> in sentence."
        words = tokenizer(sentence)
        try:
            w = words[idx]
            word = w[0] if len(w) == 2 else w
            start = sentence.find(word)
            return dict(start=start, end=start + len(word))
        except IndexError:
            dprint(u'Problem finding offset of', lemma, 'at', idx, 'in:')
            dpprint((idx, words))
            return dict(start=-1, end=-1)

    def dom(word, rel):
        return dict(offset(word.form, word.idx),
                    lpos=u'{0[0]}.{0[1]}'.format(word.lemma),
                    lemma=word.lemma[0],
                    form=word.form[0],
                    rel=rel)

    n_rel, v_rel = relation.split('-')
    noun, verb = rels[noun_l, verb_l]
    dprint('lm:', '\n noun', noun, '\n verb', verb)
    return dict(name=u'{0[0]} {1[0]}'.format(noun.lemma, verb.lemma),
                target=dom(noun, n_rel),
                source=dom(verb, v_rel),
                seed=u' '.join(u'%s.%s' % s for s in seed))
def split_files_by_year(self):
    '''Split the file by year.

    Uses the year on the first line and the year on the last line as the
    range; uses head/tail to get those lines.
    '''
    # get the first 3 lines and the last 2 lines
    head = subprocess.Popen(["head", "-n 3", self.file_name],
                            stdout=subprocess.PIPE).communicate()[0]
    end = subprocess.Popen(["tail", "-n 2", self.file_name],
                           stdout=subprocess.PIPE).communicate()[0]
    dic_list = []
    xml = ElementTree.fromstring(head + end)
    for node in xml.iter('row'):
        dic_list.append(dict(zip(node.attrib.keys(), node.attrib.values())))
    # get only the year of the string 2008-01-01
    start_date = int(dic_list[0]['CreationDate'][0:4])
    end_date = int(dic_list[1]['CreationDate'][0:4])
    # file name we are reading
    base = basename(self.file_name)
    # for each of the years create a file and cat the content there
    process = []
    out_list = []
    num_processes = 0
    for y in range(start_date, end_date + 1):
        out = base.replace('.', '{}.'.format(y))
        out = abspath(self.file_name).replace(base, out)
        f_out = open(out, 'w')
        out_list.append(f_out)
        dprint('Processing year {}'.format(y))
        process.append(subprocess.Popen(['egrep',
                                         'CreationDate="{}'.format(y),
                                         self.file_name],
                                        stdout=f_out))
        # f_out.write(head.split('\n')[0])
        # f_out.write(head.split('\n')[1])
        num_processes += 1
    # check if the processes are finished
    while True:
        num_finished = 0
        for p in process:
            if p.poll() == 0:
                num_finished += 1
        if num_finished == num_processes:
            break
        # give the processor some breathing time
        sleep(5)
    # close files
    for o in out_list:
        o.close()
    dprint('Finished creating {} files'.format(num_processes))
    return (start_date, end_date)
def shared_verseids(bible1, bible2):
    """Given two hash tables, return the set of keys present in both."""
    keys1 = set(bible1.keys())
    keys2 = set(bible2.keys())
    both = keys1.intersection(keys2)
    util.dprint("intersection has %0.2f%% from keys1, %0.2f%% from keys2"
                % (len(both) / len(keys1) * 100,
                   len(both) / len(keys2) * 100))
    return both
def fix_verseid(verseid, versetext):
    if verseid.count('.') != 2:
        util.dprint("{0}\t{1}".format(verseid, versetext))
        return None
    book, chapter, verse = verseid.split(".")
    book = booknames.code(book)
    assert book in booknames.knownbooks()
    return "{0}_{1}_{2}".format(book, chapter, verse)
def end(idx):
    i = idx - 1
    try:
        return ss.index(ctext[i]) + len(ctext[i])
    except ValueError:
        dprint(u"can't find {}-th '{}' in '{}'".format(i, ctext[i], ss))
        return -1
def remove_speaker_annotations(book, chapter, verse, text):
    """Given the text of a verse, take out the \\sp SPEAKER annotations."""
    out = re.sub(SPEAKER_PATTERN, "", text)
    if out != text:
        matched = re.search(SPEAKER_PATTERN, text)
        speaker = matched.group(0)
        util.dprint("Stripped speaker identification: '{0}' in '{1}'".format(
            speaker, (chapter, verse, text)))
    return out
def cross_validate(classifier, top_words, nonnull=False):
    """Given the most common words in the Spanish corpus, cross-validate our
    classifiers for each of those."""
    ## return a map from word to [(ncorrect, size)]
    out = defaultdict(list)
    util.dprint("cross validating this many words:", len(top_words))
    for w in top_words:
        util.dprint("cross validating:", w)
        doc2vec_labels = trainingdata.doc2vec_labels(w, FEATUREPREFIX,
                                                     nonnull=nonnull)
        training = []
        for d2v_string, label in doc2vec_labels:
            sent_vector = np.array([float(x) for x in d2v_string.split("_")])
            training.append((sent_vector, label))
        print("this many instances for {0}: {1}".format(w, len(training)))
        labels = set(label for (feat, label) in training)
        if len(labels) < 2:
            continue
        if len(training) < 10:
            print("not enough samples for", w)
            continue
        ## using constant random_state of 0 for reproducibility
        cv = cross_validation.KFold(len(training), n_folds=10,
                                    shuffle=False, random_state=0)
        for traincv, testcv in cv:
            mytraining = [training[i] for i in traincv]
            mytesting = [training[i] for i in testcv]
            mytraining_X = np.array([x for (x, y) in mytraining])
            mytraining_Y = np.array([y for (x, y) in mytraining])
            if len(set(mytraining_Y)) == 1:
                print("only one label, backing off to KNN.")
                classifier = KNeighborsClassifier()
            try:
                classifier.fit(mytraining_X, mytraining_Y)
            except ValueError as e:
                print("failed out on word:", w)
                print(mytraining_X)
                print(mytraining_Y)
                raise(e)
            print("trained!!", classifier)
            mytesting_X = np.array([x for (x, y) in mytesting])
            mytesting_Y = np.array([y for (x, y) in mytesting])
            predicted = classifier.predict(mytesting_X)
            ncorrect = sum(int(real == pred)
                           for real, pred in zip(mytesting_Y, predicted))
            out[w].append((ncorrect, len(mytesting)))
    return out
def make_decision(node):
    """Make a potentially-terrible decision."""
    options = [child for child in node if child.tag == 'SYN']
    choice = random.choice(options)
    dprint("[OPTIONS]", " ".join(opt.attrib['lem'] for opt in options))
    ## print("I have randomly chosen:", choice.attrib['lem'])
    for k, v in choice.attrib.items():
        node.attrib[k] = v
    ## remove the syn nodes.
    for option in options:
        node.remove(option)
def load_bible(fn):
    out = {}
    with open(fn) as infile:
        for line in infile:
            line = line.strip()
            verseid, text = line.split('\t')
            ## just to check...
            if verseid in out:
                util.dprint("{0} already in table {1}".format(
                    verseid,
                    "DIFFERENT" if text != out[verseid] else "SAME"))
            out[verseid] = text
    return out
def set_of_verses(fn):
    """Return the set of verses found in the given filename."""
    out = set()
    with open(fn) as infile:
        for line in infile:
            line = line.strip()
            verse, text = line.split("\t")
            if verse in out:
                util.dprint("WARNING duplicate verse {0} in {1}".format(
                    verse, fn))
            out.add(verse)
    return out
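# Illustrative usage sketch, not from the original source: load_bible and
# shared_verseids (above) compose naturally when comparing two translations.
# The filename arguments are hypothetical.
def shared_verse_count(fn1, fn2):
    bible1 = load_bible(fn1)
    bible2 = load_bible(fn2)
    return len(shared_verseids(bible1, bible2))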
def offset(lemma, idx):
    "Finds offset of <lemma> in sentence."
    words = tokenizer(sentence)
    try:
        w = words[idx]
        word = w[0] if len(w) == 2 else w
        start = sentence.find(word)
        return dict(start=start, end=start + len(word))
    except IndexError:
        dprint(u'Problem finding offset of', lemma, 'at', idx, 'in:')
        dpprint((idx, words))
        return dict(start=-1, end=-1)
def get_tuples(corpus):
    """Find all the nodes in the tree, return the list of source-language
    tuples."""
    target_nodes = corpus.findall(".//NODE")
    tokens = []
    for node in target_nodes:
        ref = node.attrib['ref']
        try:
            theref = int(ref)
        except:
            dprint("REFISNOTINT:", ref)
            theref = int(float(ref))
        sform = node.attrib['sform']
        slem = node.attrib['slem']
        tokens.append((theref, sform, slem))
    tokens.sort()
    return tokens
def convert_csv_to_json(input, output, verbose=True):
    # read the file using pandas, some lines at a time
    if verbose:
        dprint('Reading file {}'.format(input))
    # remove any previous output, then read the file and write it
    try:
        remove(output)
    except:
        pass
    with open(output, 'w+') as out:
        for chun in pd.read_csv(input, delimiter=';', quotechar='"',
                                chunksize=100000):
            chun.to_json(out, orient='records')
    if verbose:
        dprint('Finished writing {}'.format(output))
def path_finder(goal):
    current = 0  # keep track of node resource
    start_time = time.time()  # keep track of time resource
    path_found_bool = False
    path_found = tuple([1000, 1000])
    dprint(path_found)
    dprint(bool(path_found == None))
    # number of seconds; it was originally 25, setting it lower for brevity
    while time_limit(start_time, limit=2) is True and path_found_bool == False:
        # while node_limit(current, limit=50) is True:
        # run node_expander i number of times before checking time elapsed
        i = 1000
        # this needs debugging. TODO those ending brackets specifically
        while not fringe_nodes.empty() and i > 0 and (
                path_found_bool is False or
                (path_found[0] != 1000 and path_found[0] < min_f[0])):
            path_found = node_expander(goal)
            dprint(path_found)
            dprint(bool(path_found == None))
            if path_found[0] != 1000:
                path_found_bool = True
                print("# Path found")
                dprint(path_found)
                node = Formatting.tuple_to_string(path_found[1])
            i -= 1
            current += 1
    # if goal not found, must have reached resource limit
    if path_found_bool is False:
        print("# Resource limit")
        if None in min_f:
            print(min_f)
        node = Formatting.tuple_to_string(min_f[1])
    path = []
    while node != "root":
        path.append(node)
        node = explored_states[node]
    path.reverse()
    return path
def generate_split_candidates(phrase, sl, tl):
    ptentries = []
    splits = list(reversed(allsplits(list(phrase))))
    dprint(splits)
    for split in splits:
        split_strings = [" ".join(entry) for entry in split]
        found = []
        for entry in split_strings:
            foundsomething = False
            from_pt = phrasetable.lookup(entry)
            if from_pt:
                foundsomething = True
                found.append(from_pt)
            elif " " not in entry:
                frombabelnet = babelnet_candidates(entry, sl, tl)
                if frombabelnet:
                    foundsomething = True
                    found.append(frombabelnet)
            if not foundsomething:
                found.append([])
        if all(found):
            for assignment in itertools.product(*found):
                target = " ".join(pte.target for pte in assignment)
                pdirects = [pte.pdirect for pte in assignment]
                pinverses = [pte.pinverse for pte in assignment]
                product_pdirect = functools.reduce(operator.mul, pdirects, 1)
                product_pinverse = functools.reduce(operator.mul, pinverses, 1)
                entry = PTEntry(source=" ".join(phrase),
                                target=target,
                                pdirect=product_pdirect,
                                pinverse=product_pinverse)
                ptentries.append(entry)
                ## XXX: magic number, or maybe "tunable hyperparameter".
                if len(ptentries) == 10000:
                    return ptentries
    return ptentries
def rescore_candidates(candidates, weights, leftcontext, rightcontext, sentid, args):
    pmi_cls = pmi.PMI(args.target)
    parsefn = "{0}-{1}-{2}-devel".format(args.source, args.target, sentid)
    parsecache = parser_interface.PARPATH + parsefn + ".conll"
    parser = parser_interface.Pcandidates(args.target, parsecache)
    newcandidates = []
    allsentences = []
    for (score, ptentry, scores) in candidates:
        sentence = []
        sentence.extend(leftcontext.split())
        sentence.extend(ptentry.target.split())
        sentence.extend(rightcontext.split())
        allsentences.append(sentence)
    ## XXX: make the caching work.
    if os.path.exists(parsecache):
        parser.load_new_parse(parsecache, allsentences)
    else:
        parser.do_new_parse(allsentences, sentid)
    for (score, ptentry, scores) in candidates:
        sentence = []
        sentence.extend(leftcontext.split())
        sentence.extend(ptentry.target.split())
        sentence.extend(rightcontext.split())
        lex, pos = parser.find_rels(sentence, ptentry.target.split())
        score_lex = nonzero(pmi_cls.sim_lex(lex))
        score_pos = nonzero(pmi_cls.sim_pos(pos))
        logprob_lex = math.log(score_lex, 10)
        logprob_pos = math.log(score_pos, 10)
        dprint("PTENTRY, LEX AND POS:", ptentry.target, score_lex, score_pos)
        score += (weights["PMI_LEX"] * logprob_lex)
        score += (weights["PMI_POS"] * logprob_pos)
        scores = scores + (logprob_lex, logprob_pos)
        newcandidates.append((score, ptentry, scores))
    pmi_cls.dump_cache()
    return newcandidates
def validate_RRSET(keys, rrsig_set, rr_set, domain_name):
    """
    Validates the signature on an RRset
    :param keys: The DNSKEYS to check with
    :param rrsig_set: A set of RRSIGs to check
    :param rr_set: The RRset
    :param domain_name: The domain name of the RRset
    :return: The RRSIG record that verified
    """
    for sig in rrsig_set:
        if sig.algorithm != DNSPacket.ALGO_TYPE_RSASHA256:
            dprint("ERROR\tUNKNOWN ALGORITHM", sig.algorithm)
            return None
        for set_ordering in itertools.permutations(rr_set, len(rr_set)):
            rrset_data = crypto.createRRSetData(set_ordering, sig, domain_name)
            for key in keys:
                if crypto.verify_signature(sig.signature, key, rrset_data):
                    return sig
    return None
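# Illustrative sketch, not from the original source: validate_RRSET is used the
# way main() below uses it -- pull the RRset, its RRSIGs, and the zone's DNSKEYs
# out of DNS responses, then check whether any RRSIG verifies the set.
def rrset_is_valid(dnskey_response, record_response, domain_name):
    keys = get_keys(dnskey_response)
    rr_set = get_rrset(record_response)
    rrsig_set = get_rrsigs(record_response)
    return validate_RRSET(keys, rrsig_set, rr_set, domain_name) is not None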
def main():
    seenbooks = set()
    book = None
    chapter = None
    verse = None
    text = ""
    for bookfn in sys.argv[1:]:
        with open(bookfn) as infile:
            for line in infile:
                line = line.strip()
                # print("LINE", line)
                if any(line.startswith(startmarker)
                       for startmarker in ["\\v", "\\h", "\\c", "\\d"]):
                    if book and chapter and verse and text:
                        clean_and_print(book, chapter, verse, text)
                        text = ""
                    if book and chapter and text and (verse is None):
                        util.dprint("skipping structure information:", text)
                    if line.startswith("\\h"):
                        splitted = line.split(maxsplit=1)
                        bookname = splitted[1]
                        book = booknames.code(bookname)
                        if not book:
                            util.dprint("warning! not a known book:", bookname)
                        else:
                            seenbooks.add(book)
                    elif line.startswith("\\c"):
                        splitted = line.split()
                        chapter = splitted[1]
                    elif line.startswith("\\v"):
                        splitted = line.split(maxsplit=2)
                        verse = splitted[1]
                        text = splitted[2]
                    elif line.startswith("\\d"):
                        splitted = line.split(maxsplit=1)
                        verse = None
                        text = splitted[1]
                        continue
                else:
                    text = text + " " + line
        ## hit the end of this file.
        if book and chapter and verse and text:
            clean_and_print(book, chapter, verse, text)
        book = None
        chapter = None
        verse = None
        text = ""
    util.dprint("Saw this many books out of expected 66:", len(seenbooks))
    util.dprint("books we haven't seen:", booknames.knownbooks() - seenbooks)
def main():
    seenbooks = set()
    book = None
    chapter = None
    verse = None
    text = ""
    for bookfn in sys.argv[1:]:
        with open(bookfn) as infile:
            root = ET.fromstring(infile.read())
            for seg in root.iter('seg'):
                seg_id = seg.get('id')
                _, book, chapter, verse = seg_id.split(".")
                book = booknames.code(book)
                if not seg.text:
                    continue
                text = seg.text.strip()
                seenbooks.add(book)
                print("{0}_{1}_{2}\t{3}".format(book, chapter, verse, text))
    util.dprint("Saw this many books out of expected 66:", len(seenbooks))
    util.dprint("books we haven't seen:",
                sorted(booknames.knownbooks() - seenbooks))
    util.dprint("surprising books:",
                sorted(seenbooks - booknames.knownbooks()))
def main():
    s = xmlrpc.client.ServerProxy('http://localhost:8000')
    lines = []
    for line in sys.stdin:
        if line.strip():
            lines.append(line.strip())
    corpus = ET.fromstringlist(lines)
    for sentence in corpus:
        sentnum = sentence.attrib['ref']
        tuples = lexsel_util.get_tuples(sentence)
        surface = [tup[1] for tup in tuples]
        dprint("[SURFACE]", " ".join(surface))
        answers = s.label_sentence(tuples)
        dprint("[ANSWERS]", answers)
        ## all the NODE elements in the tree that have a SYN underneath
        target_nodes = sentence.findall(".//NODE/SYN/..")
        changed = False
        for node in target_nodes:
            changed_here = make_decision(node, answers)
            if changed_here:
                changed = True
        if changed:
            dprint("[CLASSIFIERSENTENCE]", sentnum)
    print(ET.tostring(corpus, encoding="unicode"))
def setup(jsondata):
    init()
    dprint(jsondata.keys())
    colour = jsondata.pop("colour")
    # read through file, add corresponding values to dictionary
    for key in jsondata.keys():
        for i in jsondata[key]:
            dprint(type(i))
            dprint(type(tuple(i)))
            assign_to_board(key, tuple(i), colour)
def remove_footnotes(book, chapter, verse, text):
    """Heavy-handedly use a regex to remove footnotes.
    Returns a new string."""
    out = re.sub(FOOTNOTE_PATTERN, "", text)
    if out == text:
        if re.search(START_FOOTNOTE_PATTERN, text):
            util.dprint("not good -- start of footnote but no end?")
            util.dprint(text)
            util.dprint(chapter, verse, text)
            assert False
    return out
def remove_crossrefs(book, chapter, verse, text):
    """Heavy-handedly use a regex to remove cross-references.
    Returns a new string."""
    out = re.sub(CROSSREF_PATTERN, "", text)
    if out == text:
        if re.search(START_CROSSREF_PATTERN, text):
            util.dprint("not good -- start of crossref but no end?")
            util.dprint(text)
            util.dprint(chapter, verse, text)
            assert False
    return out
def create_simple_file(self, out_name, fields, chunk_size=100000,
                       process_num=5, all_file=False):
    # read the file, create the xml
    count = 0
    lines = 0
    data = []
    dprint('Start processing file {}'.format(self.file_name))
    with open(out_name, 'w') as out:
        with open(self.file_name, 'r') as f:
            # write the headers
            writer = csv.writer(out, delimiter=FIELD_SEP, quotechar='"')
            writer.writerow(fields)
            for i, line in enumerate(f):
                try:
                    data.append(line)
                except:
                    print(type(line))
                    continue
                count += 1
                # check if the lines are fetched already
                if count >= chunk_size:
                    if self.verbose:
                        dprint('Copied {}'.format(count))
                    lines += self.output_line(data, process_num, out)
                    # check if it is time to leave
                    if not all_file:
                        break
                    count = 0
                    del(data)
                    data = []
            # check if all the file was output
            if count > 0:
                lines += self.output_line(data, process_num, out)
    if self.verbose:
        dprint('Finished processing file {}'.format(self.file_name))
    dprint(lines)
def main():
    lines = []
    for line in sys.stdin:
        if line.strip():
            lines.append(line.strip())
    corpus = ET.fromstringlist(lines)
    dprint("!" * 80)
    lexsel_util.get_tuples(corpus)
    dprint(lexsel_util.prettify(corpus))
    dprint("!" * 80)
    ## find all the NODE elements in the tree that have a SYN underneath them
    target_nodes = corpus.findall(".//NODE/SYN/..")
    for node in target_nodes:
        make_decision(node)
    print(ET.tostring(corpus, encoding="unicode"))
def dump_de(pic):
    dprint("CHECK DUMPING in dump_de!! size:", len(pic))
    pickle.dump(pic, open(PICPATH + "de.cache", "wb"))
def load_de():
    de = pickle.load(open(PICPATH + "de.cache", "rb"))
    dprint("CHECK LOADING in load_de!! size:", len(de))
    return de
def main():
    # Handle arguments
    args = getArgumentDict()
    domain_name = args['domain-name']
    resolver_address = parse_server(args['server'])
    record = args['record']

    if record not in ["A", "DNSKEY", "DS"]:
        print("ERROR\t" + str(record) + " NOT SUPPORTED")
        sys.exit(1)

    if args['debug']:
        util.debug_print_enabled = True

    connection = UDPCommunication()

    query_type = DNSPacket.RR_TYPE_A
    if record == "DNSKEY":
        query_type = DNSPacket.RR_TYPE_DNSKEY
    elif record == "DS":
        query_type = DNSPacket.RR_TYPE_DS

    split_domain = domain_name.split('.')
    parent_domain = '.'.join(split_domain[1:])

    # Regardless of query type, we need to verify the chain of trust
    if not verify_zone(domain_name, connection, resolver_address):
        print("ERROR\tMISSING-DS")
        sys.exit(1)

    if query_type == DNSPacket.RR_TYPE_A:
        dprint("\n\n\nGetting A Record:")
        arecord_response = get_packet(connection, domain_name, resolver_address,
                                      DNSPacket.RR_TYPE_A)
        arecord_response.print()
        arecord_response.dump()
        rr_set = get_rrset(arecord_response, error_if_empty="ERROR\tMISSING-A")
        rrsig_set = get_rrsigs(arecord_response, error_if_empty="ERROR\tMISSING-RRSIG")

        dnskey_response = get_packet(connection, domain_name, resolver_address,
                                     DNSPacket.RR_TYPE_DNSKEY)
        keys = get_keys(dnskey_response, error_if_empty="ERROR\tMISSING-DNSKEY")
        key_rrsig_set = get_rrsigs(arecord_response, error_if_empty="ERROR\tMISSING-RRSIG")

        if validate_RRSET(keys, key_rrsig_set, rr_set, domain_name) is None:
            print("ERROR\tINVALID-RRSIG")
            sys.exit(1)

        associated_rrsig = validate_RRSET(keys, rrsig_set, rr_set, domain_name)
        if associated_rrsig is not None:
            for record in rr_set:
                print_record(record, associated_rrsig, True)
        else:
            for record in rr_set:
                print_record(record, associated_rrsig, False)
            print("ERROR\tINVALID-RRSIG")
            sys.exit(1)

    elif query_type == DNSPacket.RR_TYPE_DNSKEY:
        dprint("\n\n\nGetting Keys:")
        dnskey_response = get_packet(connection, domain_name, resolver_address,
                                     DNSPacket.RR_TYPE_DNSKEY)
        dprint("\nDNSKEY Record Response packet:")
        dnskey_response.dump()
        keys = get_keys(dnskey_response, error_if_empty="ERROR\tMISSING-DNSKEY")
        rrsig_set = get_rrsigs(dnskey_response, error_if_empty="ERROR\tMISSING-RRSIG")
        rr_set = get_rrset(dnskey_response)

        associated_rrsig = validate_RRSET(keys, rrsig_set, rr_set, domain_name)
        if associated_rrsig is not None:
            for record in rr_set:
                print_record(record, associated_rrsig, True)
        else:
            for record in rr_set:
                print_record(record, associated_rrsig, False)
            print("ERROR\tINVALID-RRSIG")
            sys.exit(1)

    elif query_type == DNSPacket.RR_TYPE_DS:
        dprint("\n\n\nGetting DS records")
        ds_response = get_packet(connection, domain_name, resolver_address,
                                 DNSPacket.RR_TYPE_DS)
        ds_response.dump()
        ds_rr_set = get_rrset(ds_response, error_if_empty="ERROR\tMISSING-DS")
        ds_rrsig_set = get_rrsigs(ds_response, error_if_empty="ERROR\tMISSING-RRSIG")

        dnskey_response = get_packet(connection, parent_domain, resolver_address,
                                     DNSPacket.RR_TYPE_DNSKEY)
        dprint("\nDNSKEY Record Response packet:")
        keys = get_keys(dnskey_response, error_if_empty="ERROR\tMISSING-DNSKEY")

        associated_rrsig = validate_RRSET(keys, ds_rrsig_set, ds_rr_set, domain_name)
        for ds_record in ds_rr_set:
            if associated_rrsig is not None:
                print_record(ds_record, associated_rrsig, True)
            else:
                print_record(ds_record, associated_rrsig, False)
def verify_zone(domain_name, connection, resolver_address):
    """
    Attempts to verify the public key of the given zone by establishing PKI from root
    :param domain_name: The domain name to begin at
    :param connection: A UDP connection object to use
    :param resolver_address: The address of the resolver
    :return: True if zone verified, false otherwise
    """
    split_domain = domain_name.split('.')
    for i in range(len(split_domain)):
        cur_domain = '.'.join(split_domain[i:])
        parent_domain = '.'.join(split_domain[i + 1:])
        dprint("\n\nVerifying {0} key using {1}".format(cur_domain, parent_domain))

        # Fetch DS records
        query = DNSPacket.newQuery(cur_domain, DNSPacket.RR_TYPE_DS, using_dnssec=True)
        connection.sendPacket(resolver_address, query)
        ds_response = connection.waitForPacket()

        # Pull DS records out from the response
        ds_records = []
        for answer in ds_response.answers:
            if answer.type == DNSPacket.RR_TYPE_DS:
                ds_records.append(answer)
        if len(ds_records) == 0:
            return False
        dprint("\nFound {0} ds records".format(len(ds_records)))

        # Fetch DNSKEY records
        query = DNSPacket.newQuery(cur_domain, DNSPacket.RR_TYPE_DNSKEY, using_dnssec=True)
        connection.sendPacket(resolver_address, query)
        dnskey_response = connection.waitForPacket()

        # Pull keys from the response
        keys = get_keys(dnskey_response)
        if len(keys) == 0:
            return False
        dprint("\nFound {0} keys".format(len(keys)))

        # Try to validate a key, any key
        key_validated = False
        for ds_record in ds_records:
            for key in keys:
                ds_digest = ds_record.digest
                key_hashed = crypto.createDSRecord(key, cur_domain)
                dprint("\nDS hash:    ", ds_digest)
                dprint("DNSKEY hash:", key_hashed)
                if ds_digest == key_hashed:
                    dprint("MATCH WOOHOO")
                    key_validated = True
                    break
            if key_validated:
                break
        else:
            dprint("ERROR: Unable to validate any DNSKEY with parent zone")
            return False
    return True
def load_de():
    de = pickle.load(open(PICPATH + "de.cache", "rb"))
    dprint("CHECK LOADING in load_de!! size:", len(de))
    return de
def main():
    argparser = get_argparser()
    args = argparser.parse_args()
    inputfilename = args.infn
    outputfilename = args.outfn
    weightsfn = args.weights
    targetlang = args.target
    zmert = args.zmert  ## if true, output in zmert output format

    ## load weights for our different features
    weights = load_weights(weightsfn)
    dprint(weights)

    ## load things not in the phrase table.
    oov_lookup = load_oovs(args.source, args.target)

    reader = format.Reader(inputfilename)
    writer = format.Writer(outputfilename, reader.L1, reader.L2)
    lm = kenlm.LanguageModel(args.lm)
    phrasetable.set_phrase_table(args.pt)

    ## dictionary from sentid to [(cand, candsentence) ...]
    sentencepairs = read_sentencepairs(reader)
    sent_cand_pairs = sentences_and_candidates(sentencepairs, args, oov_lookup)

    sentids = sorted(list(sent_cand_pairs.keys()))
    for sentid in sentids:
        sentencepair = sentencepairs[sentid]
        ## now we have a list of (ptentry, list_of_words)
        candidates = sent_cand_pairs[sentid]
        inputfragments = list(sentencepair.inputfragments())
        assert len(inputfragments) == 1
        leftcontext, fragment, rightcontext = inputfragments[0]
        assert isinstance(fragment, format.Fragment)

        scored = score_candidates(candidates, weights, leftcontext,
                                  rightcontext, lm)
        scored.sort(reverse=True)
        tophundred = scored[:100]
        scored = rescore_candidates(tophundred, weights, leftcontext,
                                    rightcontext, sentid, args)

        if zmert:
            ## TODO: pull this out into a function
            ### output the n-best translations in ZMERT format
            for cand in scored[:10]:
                translatedvalue = cand[1].target.split()
                translatedfragment = format.Fragment(tuple(translatedvalue),
                                                     fragment.id)
                sentencepair.output = \
                    sentencepair.replacefragment(fragment, translatedfragment,
                                                 sentencepair.input)
                strings = [" ".join(item.value)
                           if type(item) is format.Fragment else item
                           for item in sentencepair.output]
                text = " ".join(strings)
                scores = " ".join([str(score) for score in cand[2]])
                out = "{0} ||| {1} ||| {2}".format(int(sentencepair.id) - 1,
                                                   text, scores)
                print(out)
        else:
            print(scored[0])
            translatedvalue = scored[0][1].target.split()
            translatedfragment = format.Fragment(tuple(translatedvalue),
                                                 fragment.id)
            if args.oof:
                for cand in scored[1:5]:
                    alt = format.Alternative(tuple(cand[1].target.split()))
                    translatedfragment.alternatives.append(alt)
            sentencepair.output = sentencepair.replacefragment(
                fragment, translatedfragment, sentencepair.input)
            writer.write(sentencepair)
            print("Input: " + sentencepair.inputstr(True, "blue"))
            print("Output: " + sentencepair.outputstr(True, "yellow"))
    writer.close()
    reader.close()
def tableau_closed(tree):
    if util.debug:
        util.dprint('tableau_closed:')
        for branch in tree:
            util.dprint('branch:')
            for f in branch:
                util.dprint('  ', fml_to_str(f))
        util.dprint()
    substs = []
    for branch in tree:
        newsubsts = tableau_branch_closed(branch, substs)
        if util.debug:
            util.dprint('current substs: ', subst_to_str(substs))
            util.dprint('new substs:     ', subst_to_str(newsubsts))
        if newsubsts == None:
            return False
        substs = compose_subst(newsubsts, substs)
        if util.debug:
            util.dprint('composed substs:', subst_to_str(substs))
            util.dprint()
    if util.debug:
        util.dprint('tableau closed with', subst_to_str(substs))
    return True
def cross_validate(classifier, top_words, nonnull=False):
    """Given the most common words in the Spanish corpus, cross-validate our
    classifiers for each of those."""
    ## return a map from word to [(ncorrect, size)]
    out = defaultdict(list)
    util.dprint("cross validating this many words:", len(top_words))
    loader = word_vectors.EmbeddingLoader(EMBEDDINGS, EMBEDDING_DIM)
    assert COMBINATION, "need to specify some kind of embedding combination"
    for w in top_words:
        util.dprint("cross validating:", w)
        text_with_labels = trainingdata.text_label_pairs(w, nonnull=nonnull)
        training = []
        for text, index, label in text_with_labels:
            surfaceword = text[index]
            if MWEs:
                text = loader.replace_mwes_in_tokens(text)
                for i, token in enumerate(text):
                    if surfaceword == token or surfaceword in token.split("_"):
                        index = i
                        break
            if COMBINATION == "window":
                startindex = max(index - 3, 0)
                endindex = min(index + 4, len(text))
                word_embeddings = [loader.embedding(text[i])
                                   for i in range(startindex, endindex)]
            elif COMBINATION == "fullsent":
                word_embeddings = [loader.embedding(word) for word in text]
            elif COMBINATION == "pyramid":
                word_embeddings = []
                for position, word in enumerate(text):
                    scaling = (10 - abs(position - index)) / 10
                    scaling = max(0, scaling)
                    if scaling:
                        vec = scaling * loader.embedding(word)
                        word_embeddings.append(vec)
            sent_vector = sum(word_embeddings)
            if type(sent_vector) is not np.ndarray:
                print(text)
                print(word_embeddings)
                print(surfaceword)
                print(sent_vector)
                raise ValueError("sent_vector not an array")
            training.append((sent_vector, label))
        print("this many instances for {0}: {1}".format(w, len(training)))
        labels = set(label for (feat, label) in training)
        if len(labels) < 2:
            continue
        if len(training) < 10:
            print("not enough samples for", w)
            continue
        ## using constant random_state of 0 for reproducibility
        cv = cross_validation.KFold(len(training), n_folds=10,
                                    shuffle=False, random_state=0)
        for traincv, testcv in cv:
            mytraining = [training[i] for i in traincv]
            mytesting = [training[i] for i in testcv]
            mytraining_X = np.array([x for (x, y) in mytraining])
            mytraining_Y = np.array([y for (x, y) in mytraining])
            if len(set(mytraining_Y)) == 1:
                print("only one label, backing off to KNN.")
                classifier = KNeighborsClassifier()
            try:
                classifier.fit(mytraining_X, mytraining_Y)
            except ValueError as e:
                print("failed out on word:", w)
                print(mytraining_X)
                print(mytraining_Y)
                raise(e)
            print("trained!!", classifier)
            mytesting_X = np.array([x for (x, y) in mytesting])
            mytesting_Y = np.array([y for (x, y) in mytesting])
            predicted = classifier.predict(mytesting_X)
            ncorrect = sum(int(real == pred)
                           for real, pred in zip(mytesting_Y, predicted))
            out[w].append((ncorrect, len(mytesting)))
    return out
def update_piece(coords):
    dprint(type(coords))
    dprint(len(coords))
    for x in coords:
        dprint(type(x))
    # just in case, converting from 3-tuple to 2-tuple
    coords = list(map(Formatting.throuple2tuple, coords))
    dprint(coords)
    if len(coords) > 2:
        print("OH WHOOPS YA GOT AN ERROR")
    if len(coords) == 2:
        val = board_dict[coords[0]]
        board_dict[coords[1]] = val
        dprint("new: {}".format(board_dict[coords[1]]))
    # if the piece exits, it just disappears
    board_dict[coords[0]] = BLANK_SPACE
    dprint("original: {}".format(board_dict[coords[0]]))
    json_out = copy.deepcopy(json_in)
    return relations, json_out


def cluster(lang):
    from findmet import cluster
    return cluster[lang]


uerr = uwriter(sys.stderr)


def dump((n, v), indent=0, stream=uerr):
    """Dump a relation onto <stream>."""
    dprint(u' ' * indent, u'{0[0]}.{0[1]} {1[0]}.{1[1]}'.format(n, v))


class MetaphorFinderEx(object):
    """Metaphor finder. Efficient version."""

    def __init__(self, lang, seed_fname, extend_seeds):
        def tag_ext(pos):
            return lambda words: extended(tagged(words, pos))

        def tag(pos):
            return lambda words: tagged(words, pos)

        noun_fn, verb_fn = cluster(lang)
        with uopen(seed_fname) as lines:
            seeds = read_seed(l.rstrip().split() for l in lines)
def m4detect(lang, json_in, seed_fn, invoke_parser=False, extend_seeds=False, **kw):
    """Metaphor detection using the seed system.

    :param lang: language (one of 'en', 'es', 'ru', 'fa')
    :param json_in: the json document object (a dict) containing at least a 'sentences' key
    :param seed_fn: a list of seeds
    :param invoke_parser: invoke parser on the sentences in the json doc
    :param extend_seeds: whether or not to try to extend seeds (English only)
    :returns: json_in with a list of the found LMs appended to each sentence
    """
    relations, json_out = extract(json_in, lang, invoke_parser)

    def counted(relation):
        return Counter((noun, verb)
                       for rel, noun, verb in dependencies if rel == relation)

    tokenizer = parserdesc(lang).tokenizer

    def lm(sentence, relation, seed, noun_l, verb_l):
        """Outputs an LM with all the required keys."""
        def offset(lemma, idx):
            "Finds offset of <lemma> in sentence."
            words = tokenizer(sentence)
            try:
                w = words[idx]
                word = w[0] if len(w) == 2 else w
                start = sentence.find(word)
                return dict(start=start, end=start + len(word))
            except IndexError:
                dprint(u'Problem finding offset of', lemma, 'at', idx, 'in:')
                dpprint((idx, words))
                return dict(start=-1, end=-1)

        def dom(word, rel):
            return dict(offset(word.form, word.idx),
                        lpos=u'{0[0]}.{0[1]}'.format(word.lemma),
                        lemma=word.lemma[0],
                        form=word.form[0],
                        rel=rel)

        n_rel, v_rel = relation.split('-')
        noun, verb = rels[noun_l, verb_l]
        dprint('lm:', '\n noun', noun, '\n verb', verb)
        return dict(name=u'{0[0]} {1[0]}'.format(noun.lemma, verb.lemma),
                    target=dom(noun, n_rel),
                    source=dom(verb, v_rel),
                    seed=u' '.join(u'%s.%s' % s for s in seed))

    # TODO: optimization: this should be created once at the beginning. Perhaps on import?
    mfinder = MetaphorFinderEx(lang, seed_fn, extend_seeds)
    # TODO: this is inefficient: Python will evaluate arguments anyway
    # dprint('All possible metaphors:')
    # dforeach(partial(dump, indent=1), sorted(mfinder.mbuilder.metaphors))

    # relations grouped by sentence id
    depsbysent = groupby(relations, key=lambda (sent_id, _): sent_id)
    sentences = json_out['sentences']
    for i, deps in ((i - 1, list(deps)) for i, deps in depsbysent):
        # index deps by <noun-lemma, verb-lemma> pairs
        rels = dict(((n_l, v_l), (Struct(lemma=n_l, form=n_f, idx=int(n_idx)),
                                  Struct(lemma=v_l, form=v_f, idx=int(v_idx))))
                    for _, (n_idx, v_idx, _, n_f, n_l, v_f, v_l) in deps)
        mets = mfinder.find(rels.keys())
        sent = sentences[i]
        dprint('_' * 96, '\n', sent['text'])
        dforeach(partial(dump, indent=1), rels.keys())
        lms = [lm(sent['text'], rel, seed, noun_l, verb_l)
               for (rel, seed, (noun_l, verb_l)) in mets]
        dprint('found LMs:', pformat(lms))
        sent['lms'] = lms
    jsonout = dict((k, v) for k, v in json_in.items() if k != 'sentences')
    jsonout['sentences'] = sentences
    return jsonout
def make_decision(node, answers):
    """Make a potentially-terrible decision."""
    default = node.attrib['lem']
    option_nodes = [child for child in node if child.tag == 'SYN']
    option_lemmas = ([opt.attrib['lem'] for opt in option_nodes] + [default])
    dprint("[DEFAULT]", default)
    dprint("[OPTIONS]", " ".join(option_lemmas))
    textref = node.attrib['ref']
    try:
        ref = int(textref)
    except:
        dprint("REFISNOTINT:", textref)
        ref = int(float(textref))
    chipa_says = answers[ref - 1]
    dprint("[CHIPASAYS]", chipa_says)
    ## chipa_says is the list of things in descending order of goodness.
    best = None
    for ans in chipa_says:
        if ans in option_lemmas:
            best = ans
            break
    choice = None
    for child in option_nodes:
        if child.attrib['lem'] == best:
            dprint("HOLY COW CLASSIFIER MADE A DECISION")
            choice = child
            break
    if choice is None:
        dprint("CLASSIFIER DIDN'T HELP, BAILING")
        return True
    for k, v in choice.attrib.items():
        node.attrib[k] = v
    ## remove the syn nodes.
    for option_node in option_nodes:
        node.remove(option_node)
    return True
def dump_de(pic):
    dprint("CHECK DUMPING in dump_de!! size:", len(pic))
    pickle.dump(pic, open(PICPATH + "de.cache", "wb"))