Example 1
    def define_in_namespace(self, namespace, computational_graph_definition):
        if self.namespace is not None:
            raise ValueError("model may only be initialized once.")

        self.namespace = check.check_not_empty(namespace)
        self.save_name = check.check_not_empty(
            re.sub("[^a-zA-Z0-9]", "", self.namespace))

        with tf.variable_scope(self.namespace):
            logging.debug("Defining computational graph.")
            computational_graph_definition()

        self.saver = tf.train.Saver(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                              scope=self.namespace))
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        safe_str = ""

        # `safe_on` is not defined in this snippet; presumably a flag set elsewhere
        # that disables TensorFlow's arithmetic graph rewrites.
        if safe_on:
            config.graph_options.rewrite_options.arithmetic_optimization = (
                rewriter_config_pb2.RewriterConfig.OFF)
            safe_str = "safely "

        session = tf.Session(config=config)
        session.run(tf.global_variables_initializer())
        logging.debug(
            "Defined computational graph and %sinitialized session." %
            safe_str)
        return session
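
For context, a minimal usage sketch under stated assumptions: Model is a hypothetical class exposing the define_in_namespace method above, and build_graph is a hypothetical TF 1.x graph definition passed as the callback.

import tensorflow as tf

def build_graph():
    # Hypothetical graph: a placeholder feeding a small dense layer.
    x = tf.placeholder(tf.float32, shape=[None, 4], name="x")
    tf.layers.dense(x, 2, name="output")

model = Model()  # hypothetical wrapper class exposing define_in_namespace
session = model.define_in_namespace("my_model", build_graph)
# session now has every variable under the "my_model" scope initialized.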
Example 2
def perplexity(probabilities):
    check.check_not_empty(probabilities)
    total_log_probability = 0.0

    for probability in probabilities:
        if probability < 0.0 or probability > 1.0:
            raise ValueError("Invalid probability [0, 1]: %f." % probability)

        # ZERO_PROBABILITY is a small module-level constant (not shown here)
        # that avoids taking log2(0).
        total_log_probability += math.log2(
            ZERO_PROBABILITY if probability == 0 else probability)

    return math.pow(2.0, -total_log_probability / len(probabilities))
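
A quick worked check of the formula (a sketch; ZERO_PROBABILITY below is an assumed stand-in for the module constant not shown above): a uniform distribution over four outcomes has perplexity 4.

import math

ZERO_PROBABILITY = 1e-10  # assumed value; the real constant lives elsewhere in the module

probabilities = [0.25, 0.25, 0.25, 0.25]
total_log_probability = sum(
    math.log2(ZERO_PROBABILITY if p == 0 else p) for p in probabilities)
print(math.pow(2.0, -total_log_probability / len(probabilities)))  # -> 4.0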
Example 3
    def __init__(self, sequence, predicted, expected):
        self.sequence = check.check_not_empty(sequence)
        self.predicted = check.check_instance(predicted, SequenceStats)
        self.expected = check.check_instance(expected, SequenceStats)
        check.check_length(self.predicted.values, len(self.sequence))
        check.check_length(self.expected.values, len(self.sequence))
        self.perplexity = perplexity(self.expected.probabilities)
Example 4
    def __init__(self, name, layer, width):
        self.name = check.check_not_empty(name)
        self.layer = layer

        if layer is not None:
            check.check_gte(layer, 0)

        self.width = check.check_gte(width, 1)
Example 5
    def __init__(self, name, default, plural=False, canonicalize=lambda v: v):
        self.name = check.check_not_empty(name)
        self.default = default
        self.plural = check.check_one_of(plural, [True, False])
        self.canonicalize = canonicalize
        self.key = self.name if not self.plural else "%s[]" % self.name
Example 6
    def __init__(self, words):
        super(Term, self).__init__()
        self.words = tuple(check.check_not_empty(words))
        self._hash = None
Example 7
    def __init__(self, word):
        self.word = check.check_not_empty(check.check_not_none(word))
        self.literal = canonicalize_word(word)
Example 8
    def parse(self, input_stream):
        pages = []
        parse_terms = set()

        # First pass: fetch (or read cached) page content and links, and collect each page's terms.
        for line in input_stream:
            for item in line.split("."):
                page_id = item.strip()

                if page_id != "":
                    if os.path.exists(self._page_file_contents(page_id)):
                        with open(self._page_file_contents(page_id),
                                  "r",
                                  encoding="utf-8") as fh:
                            page_content = fh.read()
                        with open(self._page_file_links(page_id),
                                  "r",
                                  encoding="utf-8") as fh:
                            page_links = [l.strip() for l in fh.readlines()]
                    else:
                        split = page_id.split("#")
                        page = wikipedia.page(split[0])

                        try:
                            if not page.exists():
                                raise errors.Invalid(
                                    "Missing wikipedia page '%s'." % split[0])
                        except requests.exceptions.ReadTimeout as e:
                            raise errors.Invalid(
                                "Missing wikipedia page '%s'." % split[0])

                        if len(split) == 1:
                            page_content = check.check_not_empty(
                                CLEANER(page.summary))
                        else:
                            page_content = ""

                        for section in (page.section_titles
                                        if len(split) == 1 else split[1:]):
                            if section not in self.SECTION_BLACKLIST:
                                logging.debug("Page '%s' using section '%s'." %
                                              (page_id, section))
                                raw_section_content = page.section_by_title(
                                    section).text

                                if raw_section_content is not None and len(
                                        raw_section_content) > 0:
                                    section_content = CLEANER(
                                        raw_section_content)

                                    if len(section_content) > 0:
                                        page_content += " " + section_content

                        page_links = [CLEANER(l) for l in page.links]

                    pages += [page_id]

                    if not os.path.exists(self._page_file_contents(page_id)):
                        with open(self._page_file_contents(page_id),
                                  "w",
                                  encoding="utf-8") as fh:
                            fh.write(page_content.replace("\n", "\n\n"))
                        with open(self._page_file_links(page_id),
                                  "w",
                                  encoding="utf-8") as fh:
                            for link in page_links:
                                fh.write("%s\n" % link)

                    page_terms = set()

                    for page_term in self._extract_links(
                            page_id, page_links, page_content):
                        page_terms.add(page_term)

                    for term in page_terms:
                        self.terms.add(term)

                        if term not in parse_terms:
                            logging.debug("Page '%s' adding term '%s'." %
                                          (page_id, term))
                            parse_terms.add(term)
                            self.inflections.record(term, term)

        terms_trie = build_trie(parse_terms)

        # Second pass: slide a window of sentences over each page and record term co-occurrences.
        for page_id in pages:
            with open(self._page_file_contents(page_id), "r",
                      encoding="utf-8") as fh:
                page_content = fh.read()

            sentences = nlp.split_sentences(page_content, self.paragraphs)
            maximum_offset = math.ceil(float(len(sentences)) / self.window)

            for offset in range(0, maximum_offset):
                # Flatten the current window of sentences (a list of lists) into a single list of words.
                sub_corpus = [
                    word for sentence in sentences[offset:offset + self.window]
                    for word in sentence
                ]
                reference_terms = nlp.extract_terms(
                    corpus=sub_corpus,
                    terms_trie=terms_trie,
                    lemmatizer=CANONICALIZER,
                    inflection_recorder=self.inflections.record)
                logging.debug("Page '%s' reference terms: %s" %
                              (page_id, reference_terms))

                for a in reference_terms:
                    for b in reference_terms:
                        if a != b:
                            if a not in self.cooccurrences:
                                self.cooccurrences[a] = {}

                            if b not in self.cooccurrences[a]:
                                self.cooccurrences[a][b] = []

                            self.cooccurrences[a][b].append(sub_corpus)
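
The nested dictionary built in the final loop maps each reference term to every other term found in the same window, keeping the window's word list for each co-occurrence. A minimal sketch of the resulting shape (terms simplified to plain strings for illustration):

cooccurrences = {
    "alpha": {"beta": [["alpha", "and", "beta", "share", "this", "window"]]},
    "beta": {"alpha": [["alpha", "and", "beta", "share", "this", "window"]]},
}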