Code example #1
    def process_json(args):
        sin = openf(args.json_fn)
        json_in = load(fp=sin, encoding='utf-8')
        sentences_in = json_in['sentences']
        try:
            json_out = parse(lang,
                             [sanitized(s['ctext']) for s in sentences_in])
        except KeyError:
            #         from nltk.tokenize import TreebankWordTokenizer
            #         _tokenize = TreebankWordTokenizer().tokenize
            _j = u' '.join

            def make_ctext(s):  # this has side effects
                ctext = sanitized(s['text'])
                s['ctext'] = ctext
                return ctext

            json_out = parse(lang, [make_ctext(s) for s in sentences_in])

        json_out.update((k, v) for k, v in json_in.items() if k != 'sentences')
        # Sanity check: verify we haven't modified ctext
        if False:
            for idx, (sent_in,
                      sent_out) in enumerate(zip(json_in['sentences'],
                                                 json_out['sentences']),
                                             start=1):
                ctext_in, ctext_out = sent_in['ctext'], sent_out['ctext']
                try:
                    assert ctext_in == ctext_out
                except AssertionError:
                    dprint(u'error at line {}:\n  {}  \n!=\n  {}'.format(
                        idx, ctext_in, ctext_out))

        output(json_out)
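Every example on this page reports diagnostics through dprint (or util.dprint), whose definition is not shown. A minimal sketch of such a helper, assuming it simply mirrors print onto stderr whenever a debug flag is enabled (code example #31 toggles a util.debug_print_enabled flag in exactly this way; the real implementations may differ):

    import sys

    debug_print_enabled = False  # switched on e.g. by a --debug option

    def dprint(*args, **kwargs):
        """Debug print: forward to print, but only to stderr and only
        while debugging is enabled."""
        if debug_print_enabled:
            print(*args, file=sys.stderr, **kwargs)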
Code example #2
    def create_read_iterator(self, line_num=10000):
        '''
        Read line_num lines at a time from the XML file and yield each
        chunk as a DataFrame.

        :param line_num (int, optional): number of lines per chunk;
            -1 (or any non-positive value) reads the whole file as one chunk
        '''
        # read the file, create the xml
        count = 0
        l = []
        with open(self.file_name, 'r') as f:
            for i,line in enumerate(f):
                try:
                    l.append(dict(BeautifulSoup(line).row.attrs))
                    count +=1
                except:
                    dprint('Error on line {}'.format(line))

                # check if the lines are fetched already
                if count>=line_num and line_num>0:
                    dprint('Processed {}'.format(i))
                    yield DataFrame(l)
                    l = []
                    count = 0

        # in case it is reading all or finished the loop in the middle
        if count != 0:
            yield DataFrame(l)

        return
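A usage sketch for the generator above, assuming a reader object whose file_name attribute points at the XML dump (the XmlReader class and the process function are hypothetical placeholders):

    reader = XmlReader('posts.xml')  # hypothetical constructor
    for frame in reader.create_read_iterator(line_num=50000):
        process(frame)  # each chunk arrives as a pandas DataFrame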
Code example #3
File: clwsd_experiment.py Project: alexrudnick/chipa
def cross_validate(classifier, top_words, nonnull=False):
    """Given the most common words in the Spanish corpus, cross-validate our
    classifiers for each of those."""
    ## return a map from word to [(ncorrect,size)]
    out = defaultdict(list)
    util.dprint("cross validating this many words:", len(top_words))
    for w in top_words:
        util.dprint("cross validating:", w)
        training = trainingdata.trainingdata_for(w, nonnull=nonnull)
        labels = set(label for (feat,label) in training)
        if len(labels) < 2:
            continue
        if len(training) < 10:
            print("not enough samples for", w)
            continue
        ## using constant random_state of 0 for reproducibility
        cv = cross_validation.KFold(len(training), n_folds=10,
                                    shuffle=False, random_state=0)
        for traincv, testcv in cv:
            mytraining = [training[i] for i in traincv]
            mytesting = [training[i] for i in testcv]
            mytraining = mytraining + [({"absolutelynotafeature":True},
                                        "absolutelynotalabel")]
            classifier.train(mytraining)
            ncorrect = count_correct(classifier, mytesting)
            out[w].append((ncorrect,len(mytesting)))
    return out
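The cross_validation module used above was removed from scikit-learn in favour of model_selection, and KFold no longer takes the sample count as its first argument. A minimal sketch of the equivalent split with the current API (assuming scikit-learn is indeed the library targeted; the same applies to the other cross_validate variants further down this page):

from sklearn.model_selection import KFold

# Ten sequential folds, as in the original; random_state is omitted
# because it has no effect when shuffle=False.
cv = KFold(n_splits=10, shuffle=False)
for traincv, testcv in cv.split(training):
    mytraining = [training[i] for i in traincv]
    mytesting = [training[i] for i in testcv]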
Code example #4
    def lm(sentence, relation, seed, noun_l, verb_l):
        """Outputs a LM with all the required keys.
        """
        def offset(lemma, idx):
            "Finds offset of <lemma> in sentence."
            words = tokenizer(sentence)
            try:
                w = words[idx]
                word = w[0] if len(w) == 2 else w
                start = sentence.find(word)
                return dict(start=start, end=start + len(word))
            except IndexError:
                dprint(u'Problem finding offset of', lemma, 'at', idx, 'in:')
                dpprint((idx, words))
                return dict(start=-1, end=-1)

        def dom(word, rel):
            return dict(offset(word.form, word.idx),
                        lpos=u'{0[0]}.{0[1]}'.format(word.lemma),
                        lemma=word.lemma[0],
                        form=word.form[0],
                        rel=rel)

        n_rel, v_rel = relation.split('-')
        noun, verb = rels[noun_l, verb_l]
        dprint('lm:', '\n  noun', noun, '\n  verb', verb)
        return dict(name=u'{0[0]} {1[0]}'.format(noun.lemma, verb.lemma),
                    target=dom(noun, n_rel),
                    source=dom(verb, v_rel),
                    seed=u' '.join(u'%s.%s' % s for s in seed))
Code example #5
    def split_files_by_year(self):
        '''
        Split the file by year. The year range is taken from the first and
        last data rows of the file, read via head and tail.
        '''
        # get the first 3 lines
        head = subprocess.Popen(["head", "-n 3", self.file_name],
                                stdout=subprocess.PIPE).communicate()[0]
        end = subprocess.Popen(["tail", "-n 2", self.file_name],
                                stdout=subprocess.PIPE).communicate()[0]

        dic_list = []
        xml = ElementTree.fromstring(head+end)
        for node in xml.iter('row'):
            dic_list.append(dict(zip(node.attrib.keys(), node.attrib.values())))

        # get only the year of the string 2008-01-01
        start_date = int(dic_list[0]['CreationDate'][0:4])
        end_date = int(dic_list[1]['CreationDate'][0:4])

        # file name we are reading
        base = basename(self.file_name)

        # for each of the years create a file and cat the content there
        process = []
        out_list = []
        num_processes = 0
        for y in range(start_date, end_date+1):
            out = base.replace('.', '{}.'.format(y))
            out = abspath(self.file_name).replace(base, out)
            f_out = open(out, 'w')
            out_list.append(f_out)
            dprint('Processing years {}'.format(y))
            process.append(subprocess.Popen(['egrep',
                                             'CreationDate="{}'.format(y),
                                             self.file_name], stdout=f_out))
            #f_out.write(head.split('\n')[0])
            #f_out.write(head.split('\n')[1])
            num_processes += 1

        # check if the processes are finished
        while True:
            num_finished = 0
            for p in process:
                if p.poll() == 0:
                    num_finished += 1
            if num_finished == num_processes:
                break

            # give the processor some breathing time
            sleep(5)

        # close files
        for o in out_list:
            o.close()

        dprint('Finished creating {} files'.format(num_processes))

        return (start_date, end_date)
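The polling loop above can be avoided by blocking on each child process directly; a minimal sketch of that simpler shape (same behaviour, assuming nothing else needs to run while waiting):

        # Wait for every egrep child to finish, then close its output file.
        for p, f_out in zip(process, out_list):
            p.wait()
            f_out.close()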
Code example #6
File: print_bitext.py Project: alexrudnick/terere
def shared_verseids(bible1, bible2):
    """Given two hash tables, return the set of keys present in both."""
    keys1 = set(bible1.keys())
    keys2 = set(bible2.keys())
    both = keys1.intersection(keys2)
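    ## NB: the percentage maths below assumes Python 3 true division;
    ## under Python 2, len(both) / len(keys1) would truncate to an integer.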
    util.dprint("intersection has %0.2f%% from keys1, %0.2f%% from keys2" %
        (len(both) / len(keys1) * 100, len(both) / len(keys2) * 100))
    return both
Code example #7
def fix_verseid(verseid, versetext):
    if verseid.count('.') != 2:
        util.dprint("{0}\t{1}".format(verseid, versetext))
        return None
    book, chapter, verse = verseid.split(".")
    book = booknames.code(book)
    assert book in booknames.knownbooks()
    return "{0}_{1}_{2}".format(book,chapter,verse)
Code example #8
def end(idx):
    i = idx - 1
    try:
        return ss.index(ctext[i]) + len(ctext[i])
    except ValueError:
        dprint(u"can't find {}-th '{}' in '{}'".format(
            i, ctext[i], ss))
        return -1
Code example #9
File: parse_usfm.py Project: alexrudnick/terere
def remove_speaker_annotations(book, chapter, verse, text):
    """Given the text of a verse, take out the \\sp SPEAKER annotations."""
    out = re.sub(SPEAKER_PATTERN, "", text)
    if out != text:
        matched = re.search(SPEAKER_PATTERN, text)
        speaker = matched.group(0)
        util.dprint("Stripped speaker identification: '{0}' in '{1}'".format(
            speaker, (chapter,verse,text)))
    return out
Code example #10
def cross_validate(classifier, top_words, nonnull=False):
    """Given the most common words in the Spanish corpus, cross-validate our
    classifiers for each of those."""
    ## return a map from word to [(ncorrect,size)]
    out = defaultdict(list)
    util.dprint("cross validating this many words:", len(top_words))

    for w in top_words:
        util.dprint("cross validating:", w)
        doc2vec_labels = trainingdata.doc2vec_labels(w,
                                                     FEATUREPREFIX,
                                                     nonnull=nonnull)
        training = []
        for d2v_string, label in doc2vec_labels:
            sent_vector = np.array([float(x) for x in d2v_string.split("_")])
            training.append((sent_vector, label))

        print("this many instances for {0}: {1}".format(w, len(training)))
        labels = set(label for (feat,label) in training)

        if len(labels) < 2:
            continue
        if len(training) < 10:
            print("not enough samples for", w)
            continue
        ## using constant random_state of 0 for reproducibility
        cv = cross_validation.KFold(len(training), n_folds=10,
                                    shuffle=False, random_state=0)
        for traincv, testcv in cv:
            mytraining = [training[i] for i in traincv]
            mytesting = [training[i] for i in testcv]

            mytraining_X = np.array([x for (x, y) in mytraining])
            mytraining_Y = np.array([y for (x, y) in mytraining])

            if len(set(mytraining_Y)) == 1:
                print("only one label, backing off to KNN.")
                classifier = KNeighborsClassifier()

            try:
                classifier.fit(mytraining_X, mytraining_Y) 
            except ValueError as e:
                print("failed out on word:", w)
                print(mytraining_X)
                print(mytraining_Y)
                raise(e)
            print("trained!!", classifier)

            mytesting_X = np.array([x for (x, y) in mytesting])
            mytesting_Y = np.array([y for (x, y) in mytesting])
            predicted = classifier.predict(mytesting_X)
            ncorrect = sum(int(real == pred) for real, pred
                           in zip(mytesting_Y, predicted))
            out[w].append((ncorrect,len(mytesting)))
    return out
Code example #11
File: random_lexsel.py Project: alexrudnick/chipa
def make_decision(node):
    """Make a potentially-terrible decision."""
    options = [child for child in node if child.tag == 'SYN']
    choice = random.choice(options)
    dprint("[OPTIONS]", " ".join(opt.attrib['lem'] for opt in options))
    ##print("I have randomly chosen:", choice.attrib['lem'])
    for k,v in choice.attrib.items():
        node.attrib[k] = v
    ## remove the syn nodes.
    for option in options:
        node.remove(option)
Code example #12
File: print_bitext.py Project: alexrudnick/terere
def load_bible(fn):
    out = {}
    with open(fn) as infile:
        for line in infile:
            line = line.strip()
            verseid, text = line.split('\t')
            ## just to check...
            if verseid in out:
                util.dprint("{0} already in table {1}".format(verseid,
                    "DIFFERENT" if text != out[verseid] else "SAME"))
            out[verseid] = text
    return out
Code example #13
File: unify_bibles.py Project: alexrudnick/terere
def set_of_verses(fn):
    """Return the set of verses found in the given filename."""
    out = set()
    with open(fn) as infile:
        for line in infile:
            line = line.strip()
            verse, text = line.split("\t")
            if verse in out:
                util.dprint("WARNING duplicate verse {0} in {1}".format(
                    verse, fn))
            out.add(verse)
    return out
Code example #14
def offset(lemma, idx):
    "Finds offset of <lemma> in sentence."
    words = tokenizer(sentence)
    try:
        w = words[idx]
        word = w[0] if len(w) == 2 else w
        start = sentence.find(word)
        return dict(start=start, end=start + len(word))
    except IndexError:
        dprint(u'Problem finding offset of', lemma, 'at', idx, 'in:')
        dpprint((idx, words))
        return dict(start=-1, end=-1)
Code example #15
File: lexsel_util.py Project: alexrudnick/chipa
def get_tuples(corpus):
    """Find all the nodes in the tree, return the list of source-language
    tuples."""
    target_nodes = corpus.findall(".//NODE")
    tokens = []
    for node in target_nodes:
        ref = node.attrib['ref']
        try:
            theref = int(ref)
        except ValueError:
            dprint("REFISNOTINT:", ref)
            theref = int(float(ref))
        sform = node.attrib['sform']
        slem = node.attrib['slem']
        tokens.append((theref, sform, slem))
    tokens.sort()
    return tokens
Code example #16
def convert_csv_to_json(input, output, verbose=True):
    # Read the CSV with pandas, a chunk of lines at a time.
    if verbose:
        dprint('Reading file {}'.format(input))

    # Remove any previous output file before rewriting it.
    try:
        remove(output)
    except OSError:
        pass

    with open(output, 'w+') as out:
        for chun in pd.read_csv(input, delimiter=';', quotechar='"', chunksize=100000):
            chun.to_json(out, orient='records')

    if verbose:
        dprint('Finished writing {}'.format(output))
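Note that writing several chunks with orient='records' into one handle produces back-to-back JSON arrays rather than a single valid JSON document. A sketch of the same loop using JSON Lines output, which concatenates cleanly (assuming a JSON-Lines file is acceptable downstream):

    with open(output, 'w+') as out:
        for chun in pd.read_csv(input, delimiter=';', quotechar='"',
                                chunksize=100000):
            # lines=True emits one JSON object per row (JSON Lines format).
            chun.to_json(out, orient='records', lines=True)
            out.write('\n')  # separate chunks; older pandas omits the trailing newline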
Code example #17
def path_finder(goal):
    current = 0  # keep track of node resource
    start_time = time.time()  # keep track of time resource

    path_found_bool = False
    path_found = tuple([1000, 1000])
    dprint(path_found)
    dprint(bool(path_found == None))

    # time limit in seconds; it was originally 25, reduced to 2 here for brevity
    while time_limit(start_time, limit=2) is True and path_found_bool == False:
        # while node_limit(current, limit=50) is True:
        # run node_expander i number of times before checking time elapsed
        i = 1000
        # this needs debugging. TODO those ending brackets specifically
        while not fringe_nodes.empty() and i > 0 and (
                path_found_bool is False or
            (path_found[0] != 1000 and path_found[0] < min_f[0])):

            path_found = node_expander(goal)

            dprint(path_found)
            dprint(bool(path_found == None))
            if path_found[0] != 1000:
                path_found_bool = True
                print("# Path found")
                dprint(path_found)
                node = Formatting.tuple_to_string(path_found[1])

            i -= 1
        current += 1

    # if goal not found, must have reached resource limit
    if path_found_bool is False:
        print("# Resource limit")
        if None in min_f:
            print(min_f)
        node = Formatting.tuple_to_string(min_f[1])

    path = []
    while node != "root":
        path.append(node)
        node = explored_states[node]
    path.reverse()

    return path
Code example #18
def generate_split_candidates(phrase, sl, tl):
    ptentries = []

    splits = list(reversed(allsplits(list(phrase))))
    dprint(splits)

    for split in splits:
        split_strings = [" ".join(entry) for entry in split]

        found = []
        for entry in split_strings:
            foundsomething = False
            from_pt = phrasetable.lookup(entry)
            if from_pt:
                foundsomething = True
                found.append(from_pt)
            elif " " not in entry:
                frombabelnet = babelnet_candidates(entry, sl, tl)
                if frombabelnet:
                    foundsomething = True
                    found.append(frombabelnet)
            if not foundsomething:
                found.append([])

        if all(found):
            for assignment in itertools.product(*found):
                target = " ".join(pte.target for pte in assignment)
                pdirects = [pte.pdirect for pte in assignment]
                pinverses = [pte.pinverse for pte in assignment]

                product_pdirect = functools.reduce(operator.mul, pdirects, 1)
                product_pinverse = functools.reduce(operator.mul, pinverses, 1)

                entry = PTEntry(source=" ".join(phrase),
                                target=target,
                                pdirect=product_pdirect,
                                pinverse=product_pinverse)
                ptentries.append(entry)

                ## XXX: magic number, or maybe "tunable hyperparameter".
                if len(ptentries) == 10000:
                    return ptentries
    return ptentries
Code example #19
def rescore_candidates(candidates, weights, leftcontext, rightcontext, sentid, args):
    pmi_cls = pmi.PMI(args.target)
    parsefn = "{0}-{1}-{2}-devel".format(args.source, args.target, sentid)
    parsecache = parser_interface.PARPATH + parsefn + ".conll"
    parser = parser_interface.Pcandidates(args.target, parsecache)
    newcandidates = []

    allsentences = []
    for (score, ptentry, scores) in candidates:
        sentence = []
        sentence.extend(leftcontext.split())
        sentence.extend(ptentry.target.split())
        sentence.extend(rightcontext.split())
        allsentences.append(sentence)

    ## XXX: make the caching work.
    if os.path.exists(parsecache):
        parser.load_new_parse(parsecache, allsentences)
    else:
        parser.do_new_parse(allsentences, sentid) 

    for (score, ptentry, scores) in candidates:
        sentence = []
        sentence.extend(leftcontext.split())
        sentence.extend(ptentry.target.split())
        sentence.extend(rightcontext.split())

        lex,pos = parser.find_rels(sentence, ptentry.target.split())
        score_lex = nonzero(pmi_cls.sim_lex(lex))
        score_pos = nonzero(pmi_cls.sim_pos(pos))

        logprob_lex = math.log(score_lex, 10)
        logprob_pos = math.log(score_pos, 10)

        dprint("PTENTRY, LEX AND POS:", ptentry.target, score_lex, score_pos)

        score += (weights["PMI_LEX"] * logprob_lex)
        score += (weights["PMI_POS"] * logprob_pos)
        scores = scores + (logprob_lex, logprob_pos)
        newcandidates.append((score, ptentry, scores))
    pmi_cls.dump_cache()

    return newcandidates
Code example #20
def validate_RRSET(keys, rrsig_set, rr_set, domain_name):
    """
    Validates the signature on an RRset
    :param keys: The DNSKEYS to check with
    :param rrsig_set: A set of RRSIGs to check
    :param rr_set: The RRset
    :param domain_name: The domain name of the RRset
    :return: The RRSIG record that verified
    """
    for sig in rrsig_set:
        if sig.algorithm != DNSPacket.ALGO_TYPE_RSASHA256:
            dprint("ERROR\tUNKNOWN ALGORITHM", sig.algorithm)
            return None
        for set_ordering in itertools.permutations(rr_set, len(rr_set)):
            rrset_data = crypto.createRRSetData(set_ordering, sig, domain_name)
            for key in keys:
                if crypto.verify_signature(sig.signature, key, rrset_data):
                    return sig
    return None
Code example #21
File: parse_usfm.py Project: alexrudnick/terere
def main():
    seenbooks = set()
    book = None
    chapter = None
    verse = None
    text = ""

    for bookfn in sys.argv[1:]:
        with open(bookfn) as infile:
            for line in infile:
                line = line.strip()
                # print("LINE", line)
                if any(line.startswith(startmarker)
                       for startmarker in ["\\v", "\\h", "\\c", "\\d"]):
                    if book and chapter and verse and text:
                        clean_and_print(book, chapter, verse, text)
                        text = ""
                    if book and chapter and text and (verse is None):
                        util.dprint("skipping structure information:", text)
                    if line.startswith("\\h"):
                        splitted = line.split(maxsplit=1)
                        bookname = splitted[1]
                        book = booknames.code(bookname)
                        if not book:
                            util.dprint("warning! not a known book:", bookname)
                        else:
                            seenbooks.add(book)
                    elif line.startswith("\\c"):
                        splitted = line.split()
                        chapter = splitted[1]
                    elif line.startswith("\\v"):
                        splitted = line.split(maxsplit=2)
                        verse = splitted[1]
                        text = splitted[2]
                    elif line.startswith("\\d"):
                        splitted = line.split(maxsplit=1)
                        verse = None
                        text = splitted[1]
                        continue
                else:
                    text = text + " " + line
            ## hit the end of this file.
            if book and chapter and verse and text:
                clean_and_print(book, chapter, verse, text)
            book = None
            chapter = None
            verse = None
            text = ""


    util.dprint("Saw this many books out of expected 66:",len(seenbooks))
    util.dprint("books we haven't seen:", booknames.knownbooks() - seenbooks)
Code example #22
def main():
    seenbooks = set()
    book = None
    chapter = None
    verse = None
    text = ""

    for bookfn in sys.argv[1:]:
        with open(bookfn) as infile:
            root = ET.fromstring(infile.read())
            for seg in root.iter('seg'):
                seg_id = seg.get('id')
                _, book, chapter, verse = seg_id.split(".")

                book = booknames.code(book)
                if not seg.text:
                    continue

                text = seg.text.strip()
                seenbooks.add(book)

                print("{0}_{1}_{2}\t{3}".format(book, chapter, verse, text))

    util.dprint("Saw this many books out of expected 66:",len(seenbooks))
    util.dprint("books we haven't seen:",
                sorted(booknames.knownbooks() - seenbooks))
    util.dprint("surprising books:",
                sorted(seenbooks - booknames.knownbooks()))
Code example #23
File: lexsel_client.py Project: alexrudnick/chipa
def main():
    s = xmlrpc.client.ServerProxy('http://localhost:8000')

    lines = []
    for line in sys.stdin:
        if line.strip():
            lines.append(line.strip())
    corpus = ET.fromstringlist(lines)

    for sentence in corpus:
        sentnum = sentence.attrib['ref']
        tuples = lexsel_util.get_tuples(sentence)
        surface = [tup[1] for tup in tuples]
        dprint("[SURFACE]", " ".join(surface))
        answers = s.label_sentence(tuples)
        dprint("[ANSWERS]", answers)
        ## all the NODE elements in the tree that have a SYN underneath
        target_nodes = sentence.findall(".//NODE/SYN/..")
        changed = False
        for node in target_nodes:
            changed_here = make_decision(node, answers)
            if changed_here:
                changed = True
        if changed:
            dprint("[CLASSIFIERSENTENCE]", sentnum)

    print(ET.tostring(corpus,encoding="unicode"))
Code example #24
File: board.py Project: jay-ng-mc/AiPartA
def setup(jsondata):
    init()
    dprint(jsondata.keys())
    colour = jsondata.pop("colour")
    # read through file, add corresponding values to dictionary
    for key in jsondata.keys():
        for i in jsondata[key]:
            dprint(type(i))
            dprint(type(tuple(i)))
            assign_to_board(key, tuple(i), colour)
Code example #25
File: parse_usfm.py Project: alexrudnick/terere
def remove_footnotes(book, chapter, verse, text):
    """Heavy-handedly use a regex to remove footnotes. Returns a new string."""
    out = re.sub(FOOTNOTE_PATTERN, "", text)
    if out == text:
        if re.search(START_FOOTNOTE_PATTERN, text):
            util.dprint("not good -- start of footnote but no end?")
            util.dprint(text)
            util.dprint(chapter, verse, text)
            assert False
    return out
Code example #26
File: parse_usfm.py Project: alexrudnick/terere
def remove_crossrefs(book, chapter, verse, text):
    """Heavy-handedly use a regex to remove cross-references. Returns a new
    string."""
    out = re.sub(CROSSREF_PATTERN, "", text)
    if out == text:
        if re.search(START_CROSSREF_PATTERN, text):
            util.dprint("not good -- start of crossref but no end?")
            util.dprint(text)
            util.dprint(chapter, verse, text)
            assert False
    return out
Code example #27
    def create_simple_file(self, out_name, fields,
                           chunk_size= 100000,
                           process_num = 5,
                           all_file = False):

        # read the file, create the xml
        count = 0
        lines = 0
        data = []
        dprint('Start processing file {}'.format(self.file_name))
        with open(out_name, 'w') as out:
            with open(self.file_name, 'r') as f:
                # write the headers
                writer = csv.writer(out, delimiter=FIELD_SEP, quotechar='"')
                writer.writerow(fields)
                for i,line in enumerate(f):
                    try:
                        data.append(line)
                    except:
                        print(type(line))
                        continue
                    count +=1

                    # check if the lines are fetched already
                    if count>=chunk_size:
                        if self.verbose:
                            dprint('Copied {}'.format(count))

                        lines += self.output_line(data, process_num, out)

                        # check if it is time to leave
                        if not all_file:
                            break
                        count = 0
                        del(data)
                        data = []

                # check if all the file was output
                if count>0:
                    lines += self.output_line(data, process_num, out)

        if self.verbose:
            dprint('Finished processing file {}'.format(self.file_name))
            dprint(lines)
Code example #28
File: random_lexsel.py Project: alexrudnick/chipa
def main():
    lines = []
    for line in sys.stdin:
        if line.strip():
            lines.append(line.strip())
    corpus = ET.fromstringlist(lines)

    dprint("!" * 80)
    lexsel_util.get_tuples(corpus)
    dprint(lexsel_util.prettify(corpus))
    dprint("!" * 80)

    ## find all the NODE elements in the tree that have a SYN underneath them
    target_nodes = corpus.findall(".//NODE/SYN/..")
    for node in target_nodes:
        make_decision(node)

    print(ET.tostring(corpus,encoding="unicode"))
Code example #29
def dump_de(pic):
    dprint("CHECK DUMPING in dump_de!! size:", len(pic))
    pickle.dump(pic,open(PICPATH + "de.cache","wb"))
Code example #30
def load_de():
    de = pickle.load(open(PICPATH + "de.cache","rb"))
    dprint("CHECK LOADING in load_de!! size:", len(de))
    return de
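Both helpers leave the file object for the garbage collector to close; a variant of the same pair using context managers (same PICPATH and cache-file assumptions as above):

def dump_de(pic):
    dprint("CHECK DUMPING in dump_de!! size:", len(pic))
    with open(PICPATH + "de.cache", "wb") as f:
        pickle.dump(pic, f)

def load_de():
    with open(PICPATH + "de.cache", "rb") as f:
        de = pickle.load(f)
    dprint("CHECK LOADING in load_de!! size:", len(de))
    return de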
Code example #31
def main():
    # Handle arguments
    args = getArgumentDict()
    domain_name = args['domain-name']

    resolver_address = parse_server(args['server'])

    record = args['record']
    if record not in ["A", "DNSKEY", "DS"]:
        print("ERROR\t" + str(record) + " NOT SUPPORTED")
        sys.exit(1)

    if args['debug']:
        util.debug_print_enabled = True

    connection = UDPCommunication()

    query_type = DNSPacket.RR_TYPE_A
    if record == "DNSKEY":
        query_type = DNSPacket.RR_TYPE_DNSKEY
    elif record == "DS":
        query_type = DNSPacket.RR_TYPE_DS

    split_domain = domain_name.split('.')
    parent_domain = '.'.join(split_domain[1:])

    # Regardless of query type, we need to verify the chain of trust
    if not verify_zone(domain_name, connection, resolver_address):
        print("ERROR\tMISSING-DS")
        sys.exit(1)

    if query_type == DNSPacket.RR_TYPE_A:
        dprint("\n\n\nGetting A Record:")
        arecord_response = get_packet(connection, domain_name, resolver_address, DNSPacket.RR_TYPE_A)
        arecord_response.print()
        arecord_response.dump()
        rr_set = get_rrset(arecord_response, error_if_empty="ERROR\tMISSING-A")
        rrsig_set = get_rrsigs(arecord_response, error_if_empty="ERROR\tMISSING-RRSIG")
        dnskey_response = get_packet(connection, domain_name, resolver_address, DNSPacket.RR_TYPE_DNSKEY)

        keys = get_keys(dnskey_response, error_if_empty="ERROR\tMISSING-DNSKEY")
        key_rrsig_set = get_rrsigs(arecord_response, error_if_empty="ERROR\tMISSING-RRSIG")
        if validate_RRSET(keys, key_rrsig_set, rr_set, domain_name) is None:
            print("ERROR\tINVALID-RRSIG")
            sys.exit(1)

        associated_rrsig = validate_RRSET(keys, rrsig_set, rr_set, domain_name)
        if associated_rrsig is not None:
            for record in rr_set:
                print_record(record, associated_rrsig, True)
        else:
            for record in rr_set:
                print_record(record, associated_rrsig, False)
            print("ERROR\tINVALID-RRSIG")
            sys.exit(1)

    elif query_type == DNSPacket.RR_TYPE_DNSKEY:
        dprint("\n\n\nGetting Keys:")
        dnskey_response = get_packet(connection, domain_name, resolver_address, DNSPacket.RR_TYPE_DNSKEY)
        dprint("\nDNSKEY Record Response packet:")
        dnskey_response.dump()
        keys = get_keys(dnskey_response, error_if_empty="ERROR\tMISSING-DNSKEY")
        rrsig_set = get_rrsigs(dnskey_response, error_if_empty="ERROR\tMISSING-RRSIG")
        rr_set = get_rrset(dnskey_response)

        associated_rrsig = validate_RRSET(keys, rrsig_set, rr_set, domain_name)
        if associated_rrsig is not None:
            for record in rr_set:
                print_record(record, associated_rrsig, True)
        else:
            for record in rr_set:
                print_record(record, associated_rrsig, False)
            print("ERROR\tINVALID-RRSIG")
            sys.exit(1)

    elif query_type == DNSPacket.RR_TYPE_DS:
        dprint("\n\n\nGetting DS records")
        ds_response = get_packet(connection, domain_name, resolver_address, DNSPacket.RR_TYPE_DS)
        ds_response.dump()
        ds_rr_set = get_rrset(ds_response, error_if_empty="ERROR\tMISSING-DS")
        ds_rrsig_set = get_rrsigs(ds_response, error_if_empty="ERROR\tMISSING-RRSIG")

        dnskey_response = get_packet(connection, parent_domain, resolver_address, DNSPacket.RR_TYPE_DNSKEY)
        dprint("\nDNSKEY Record Response packet:")
        keys = get_keys(dnskey_response, error_if_empty="ERROR\tMISSING-DNSKEY")

        associated_rrsig = validate_RRSET(keys, ds_rrsig_set, ds_rr_set, domain_name)
        for ds_record in ds_rr_set:
            if associated_rrsig is not None:
                print_record(ds_record, associated_rrsig, True)
            else:
                print_record(ds_record, associated_rrsig, False)
Code example #32
def verify_zone(domain_name, connection, resolver_address):
    """
    Attempts to verify the public key of the given zone by establishing PKI from root
    :param domain_name: The domain name to begin at
    :param connection: A UDP connection object to use
    :param resolver_address: The address of the resolver
    :return: True if zone verified, false otherwise
    """
    split_domain = domain_name.split('.')
    for i in range(len(split_domain)):
        cur_domain = '.'.join(split_domain[i:])
        parent_domain = '.'.join(split_domain[i + 1:])
        dprint("\n\nVerifying {0} key using {1}".format(cur_domain, parent_domain))

        # Fetch DS records
        query = DNSPacket.newQuery(cur_domain, DNSPacket.RR_TYPE_DS, using_dnssec=True)
        connection.sendPacket(resolver_address, query)
        ds_response = connection.waitForPacket()

        # Pull DS records out from the response
        ds_records = []
        for answer in ds_response.answers:
            if answer.type == DNSPacket.RR_TYPE_DS:
                ds_records.append(answer)
        if len(ds_records) == 0:
            return False
        dprint("\nFound {0} ds records".format(len(ds_records)))

        # Fetch DNSKEY records
        query = DNSPacket.newQuery(cur_domain, DNSPacket.RR_TYPE_DNSKEY, using_dnssec=True)
        connection.sendPacket(resolver_address, query)
        dnskey_response = connection.waitForPacket()

        # Pull keys from the response
        keys = get_keys(dnskey_response)
        if len(keys) == 0:
            return False
        dprint("\nFound {0} keys".format(len(keys)))

        # Try to validate a key, any key
        key_validated = False
        for ds_record in ds_records:
            for key in keys:
                ds_digest = ds_record.digest
                key_hashed = crypto.createDSRecord(key, cur_domain)
                dprint("\nDS hash: ", ds_digest)
                dprint("DNSKEY hash:", key_hashed)
                if ds_digest == key_hashed:
                    dprint("MATCH WOOHOO")
                    key_validated = True
                    break
            if key_validated:
                break
        else:
            dprint("ERROR: Unable to validate any DNSKEY with parent zone")
            return False
    return True
Code example #33
def load_de():
    de = pickle.load(open(PICPATH + "de.cache", "rb"))
    dprint("CHECK LOADING in load_de!! size:", len(de))
    return de
Code example #34
def main():
    argparser = get_argparser()
    args = argparser.parse_args()
    inputfilename = args.infn
    outputfilename = args.outfn
    weightsfn = args.weights
    targetlang = args.target

    zmert = args.zmert ## if true, output in zmert output format

    ## load weights for our different features
    weights = load_weights(weightsfn)
    dprint(weights)

    ## load things not in the phrase table.
    oov_lookup = load_oovs(args.source, args.target)

    reader = format.Reader(inputfilename)
    writer = format.Writer(outputfilename, reader.L1, reader.L2)

    lm = kenlm.LanguageModel(args.lm)
    phrasetable.set_phrase_table(args.pt)

    ## dictionary from sentid to [(cand,candsentence) ...]
    sentencepairs = read_sentencepairs(reader)
    sent_cand_pairs = sentences_and_candidates(sentencepairs, args, oov_lookup)

    sentids = sorted(list(sent_cand_pairs.keys()))

    for sentid in sentids:
        sentencepair = sentencepairs[sentid]
        ## now we have a list of (ptentry, list_of_words)
        candidates = sent_cand_pairs[sentid]

        inputfragments = list(sentencepair.inputfragments())
        assert len(inputfragments) == 1
        leftcontext, fragment, rightcontext = inputfragments[0]
        assert isinstance(fragment, format.Fragment)

        scored = score_candidates(candidates,
                                  weights,
                                  leftcontext,
                                  rightcontext,
                                  lm)
        scored.sort(reverse=True)

        tophundred = scored[:100]
        scored = rescore_candidates(tophundred, weights, leftcontext,
                                    rightcontext, sentid, args)

        if zmert:
            ## TODO: pull this out into a function
            ### output the n-best translations in ZMERT format
            for cand in scored[:10]:
                translatedvalue = cand[1].target.split()
                translatedfragment = format.Fragment(tuple(translatedvalue),
                                                     fragment.id)
                sentencepair.output = \
                    sentencepair.replacefragment(fragment, translatedfragment,
                                                 sentencepair.input)
                strings = [" ".join(item.value) if type(item) is format.Fragment
                                                else item
                           for item in sentencepair.output]
                text = " ".join(strings)
                scores = " ".join([str(score) for score in cand[2]])
                out = "{0} ||| {1} ||| {2}".format(int(sentencepair.id) - 1,
                                                   text,
                                                   scores)
                print(out)
        else:
            print(scored[0])
            translatedvalue = scored[0][1].target.split()
            translatedfragment = format.Fragment(tuple(translatedvalue), fragment.id)

            if args.oof:
                for cand in scored[1:5]:
                    alt = format.Alternative(tuple(cand[1].target.split()))
                    translatedfragment.alternatives.append(alt)

            sentencepair.output = sentencepair.replacefragment(fragment,
                                                               translatedfragment,
                                                               sentencepair.input)

            writer.write(sentencepair)
            print("Input: " + sentencepair.inputstr(True,"blue"))
            print("Output: " + sentencepair.outputstr(True,"yellow"))

    writer.close()
    reader.close()
Code example #35
File: pred.py Project: winobes/modal-logic
def tableau_closed(tree):
    if util.debug:
        util.dprint('tableau_closed:')
        for branch in tree:
            util.dprint('branch:')
            for f in branch:
                util.dprint('  ', fml_to_str(f))
        util.dprint()
    substs = []
    for branch in tree:
        newsubsts = tableau_branch_closed(branch, substs)
        if util.debug:
            util.dprint('current substs: ', subst_to_str(substs))
            util.dprint('new substs:     ', subst_to_str(newsubsts))
        if newsubsts == None:
            return False
        substs = compose_subst(newsubsts, substs)
        if util.debug:
            util.dprint('composed substs:', subst_to_str(substs))
            util.dprint()
    if util.debug:
        util.dprint('tableau closed with', subst_to_str(substs))
    return True
Code example #36
def cross_validate(classifier, top_words, nonnull=False):
    """Given the most common words in the Spanish corpus, cross-validate our
    classifiers for each of those."""
    ## return a map from word to [(ncorrect,size)]
    out = defaultdict(list)
    util.dprint("cross validating this many words:", len(top_words))

    loader = word_vectors.EmbeddingLoader(EMBEDDINGS, EMBEDDING_DIM)

    assert COMBINATION, "need to specify some kind of embedding combination"

    for w in top_words:
        util.dprint("cross validating:", w)
        text_with_labels = trainingdata.text_label_pairs(w, nonnull=nonnull)

        training = []
        for text, index, label in text_with_labels:
            surfaceword = text[index]
            if MWEs:
                text = loader.replace_mwes_in_tokens(text)
                for i,token in enumerate(text):
                    if surfaceword == token or surfaceword in token.split("_"):
                        index = i
                        break

            if COMBINATION == "window":
                startindex = max(index - 3, 0)
                endindex = min(index + 4, len(text))
                word_embeddings = [loader.embedding(text[i])
                                   for i in range(startindex, endindex)]
            elif COMBINATION == "fullsent":
                word_embeddings = [loader.embedding(word) for word in text]
            elif COMBINATION == "pyramid":
                word_embeddings = []
                for position,word in enumerate(text): 
                    scaling = (10 - abs(position - index)) / 10
                    scaling = max(0, scaling)
                    if scaling:
                        vec = scaling * loader.embedding(word)
                        word_embeddings.append(vec)

            sent_vector = sum(word_embeddings)
            if type(sent_vector) is not np.ndarray:
                print(text)
                print(word_embeddings)
                print(surfaceword)
                print(sent_vector)
                raise ValueError("sent_vector not an array")
            training.append((sent_vector, label))
        print("this many instances for {0}: {1}".format(w, len(training)))
        labels = set(label for (feat,label) in training)

        if len(labels) < 2:
            continue
        if len(training) < 10:
            print("not enough samples for", w)
            continue
        ## using constant random_state of 0 for reproducibility
        cv = cross_validation.KFold(len(training), n_folds=10,
                                    shuffle=False, random_state=0)
        for traincv, testcv in cv:
            mytraining = [training[i] for i in traincv]
            mytesting = [training[i] for i in testcv]

            mytraining_X = np.array([x for (x, y) in mytraining])
            mytraining_Y = np.array([y for (x, y) in mytraining])

            if len(set(mytraining_Y)) == 1:
                print("only one label, backing off to KNN.")
                classifier = KNeighborsClassifier()

            try:
                classifier.fit(mytraining_X, mytraining_Y) 
            except ValueError as e:
                print("failed out on word:", w)
                print(mytraining_X)
                print(mytraining_Y)
                raise(e)
            print("trained!!", classifier)

            mytesting_X = np.array([x for (x, y) in mytesting])
            mytesting_Y = np.array([y for (x, y) in mytesting])
            predicted = classifier.predict(mytesting_X)
            ncorrect = sum(int(real == pred) for real, pred
                           in zip(mytesting_Y, predicted))
            out[w].append((ncorrect,len(mytesting)))
    return out
Code example #37
File: board.py Project: jay-ng-mc/AiPartA
def update_piece(coords):
        dprint(type(coords))
        dprint(len(coords))
        for x in coords:
                dprint(type(x))
        # just in case, converting from 3-tuple to 2-tuple
        coords = list(map(Formatting.throuple2tuple, coords))
        dprint(coords)
        if len(coords) > 2:
                print("OH WHOOPS YA GOT AN ERROR")
        if len(coords)== 2:
                val = board_dict[coords[0]]
                board_dict[coords[1]] = val
                dprint("new: {}".format(board_dict[coords[1]]))
        # if the piece exits, it just disappears
        board_dict[coords[0]] = BLANK_SPACE
        dprint("original: {}".format(board_dict[coords[0]]))
Code example #38
        json_out = copy.deepcopy(json_in)
    return relations, json_out


def cluster(lang):
    from findmet import cluster
    return cluster[lang]


uerr = uwriter(sys.stderr)
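## NB: dump() below uses tuple-parameter unpacking in its signature, which is
## Python 2 only syntax (removed in Python 3 by PEP 3113).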


def dump((n, v), indent=0, stream=uerr):
    """Dump a relation onto <stream>.
    """
    dprint(u' ' * indent, u'{0[0]}.{0[1]} {1[0]}.{1[1]}'.format(n, v))


class MetaphorFinderEx(object):
    """Metaphor finder. Efficient version.
    """
    def __init__(self, lang, seed_fname, extend_seeds):
        def tag_ext(pos):
            return lambda words: extended(tagged(words, pos))

        def tag(pos):
            return lambda words: tagged(words, pos)

        noun_fn, verb_fn = cluster(lang)
        with uopen(seed_fname) as lines:
            seeds = read_seed(l.rstrip().split() for l in lines)
Code example #39
def m4detect(lang,
             json_in,
             seed_fn,
             invoke_parser=False,
             extend_seeds=False,
             **kw):
    """Metaphor detection using the seed system.

    :param lang: language (one of 'en', 'es', 'ru', 'fa')
    :param json_in: the json document object (a dict) containing at least a 'sentences' key
    :param seed_fn: a list of seeds
    :param invoke_parser: invoke parser on the sentences in the json doc
    :param extend_seeds: whether or not to try to extend the seeds (English only)
    :returns: json_in with a list of the found LMs appended to each sentence
    """
    relations, json_out = extract(json_in, lang, invoke_parser)

    def counted(relation):
        return Counter((noun, verb) for rel, noun, verb in dependencies
                       if rel == relation)

    tokenizer = parserdesc(lang).tokenizer

    def lm(sentence, relation, seed, noun_l, verb_l):
        """Outputs a LM with all the required keys.
        """
        def offset(lemma, idx):
            "Finds offset of <lemma> in sentence."
            words = tokenizer(sentence)
            try:
                w = words[idx]
                word = w[0] if len(w) == 2 else w
                start = sentence.find(word)
                return dict(start=start, end=start + len(word))
            except IndexError:
                dprint(u'Problem finding offset of', lemma, 'at', idx, 'in:')
                dpprint((idx, words))
                return dict(start=-1, end=-1)

        def dom(word, rel):
            return dict(offset(word.form, word.idx),
                        lpos=u'{0[0]}.{0[1]}'.format(word.lemma),
                        lemma=word.lemma[0],
                        form=word.form[0],
                        rel=rel)

        n_rel, v_rel = relation.split('-')
        noun, verb = rels[noun_l, verb_l]
        dprint('lm:', '\n  noun', noun, '\n  verb', verb)
        return dict(name=u'{0[0]} {1[0]}'.format(noun.lemma, verb.lemma),
                    target=dom(noun, n_rel),
                    source=dom(verb, v_rel),
                    seed=u' '.join(u'%s.%s' % s for s in seed))

    # TODO: optimization: this should be created once at the beginning. Perhaps on import?
    mfinder = MetaphorFinderEx(lang, seed_fn, extend_seeds)

    # TODO: this is inefficient: Python will evaluate arguments anyway
    #     dprint('All possible metaphors:')
    #     dforeach(partial(dump, indent=1), sorted(mfinder.mbuilder.metaphors))

    # relations grouped by sentence id
    depsbysent = groupby(relations, key=lambda (sent_id, _): sent_id)
    sentences = json_out['sentences']
    for i, deps in ((i - 1, list(deps)) for i, deps in depsbysent):
        # index deps by <noun-lemma, verb-lemma> pairs
        rels = dict(((n_l, v_l), (Struct(lemma=n_l, form=n_f, idx=int(n_idx)),
                                  Struct(lemma=v_l, form=v_f, idx=int(v_idx))))
                    for _, (n_idx, v_idx, _, n_f, n_l, v_f, v_l) in deps)
        mets = mfinder.find(rels.keys())
        sent = sentences[i]
        dprint('_' * 96, '\n', sent['text'])
        dforeach(partial(dump, indent=1), rels.keys())
        lms = [
            lm(sent['text'], rel, seed, noun_l, verb_l)
            for (rel, seed, (noun_l, verb_l)) in mets
        ]
        dprint('found LMs:', pformat(lms))
        sent['lms'] = lms

    jsonout = dict((k, v) for k, v in json_in.items() if k != 'sentences')
    jsonout['sentences'] = sentences
    return jsonout
Code example #40
File: lexsel_client.py Project: alexrudnick/chipa
def make_decision(node, answers):
    """Make a potentially-terrible decision."""
    default = node.attrib['lem']
    option_nodes = [child for child in node if child.tag == 'SYN']
    option_lemmas = ([opt.attrib['lem'] for opt in option_nodes] +
                     [default])

    dprint("[DEFAULT]", default)
    dprint("[OPTIONS]", " ".join(option_lemmas))

    textref = node.attrib['ref']
    try:
        ref = int(textref)
    except ValueError:
        dprint("REFISNOTINT:", textref)
        ref = int(float(textref))
        
    chipa_says = answers[ref - 1]
    dprint("[CHIPASAYS]", chipa_says)

    ## chipa_says is the list of things in descending order of goodness.
    best = None
    for ans in chipa_says:
        if ans in option_lemmas:
            best = ans
            break

    choice = None
    for child in option_nodes:
        if child.attrib['lem'] == best:
            dprint("HOLY COW CLASSIFIER MADE A DECISION")
            choice = child
            break
    if choice is None:
        dprint("CLASSIFIER DIDN'T HELP, BAILING")
        return True

    for k,v in choice.attrib.items():
        node.attrib[k] = v
    ## remove the syn nodes.
    for option_node in option_nodes:
        node.remove(option_node)
    return True
Code example #41
def dump_de(pic):
    dprint("CHECK DUMPING in dump_de!! size:", len(pic))
    pickle.dump(pic, open(PICPATH + "de.cache", "wb"))