Example #1
def infer(test_reader, window_size=5, use_cuda=False, model_path=None):
    """
    inference function
    """
    if model_path is None or not os.path.exists(model_path):
        print(str(model_path) + " cannot be found")
        return
    # get the reverse dicts and define the index of the word of interest
    # in the window (must be the same index as used during training)
    reverse_word_dict = reverse_dict(word_dict)
    reverse_lbl_dict = reverse_dict(lbl_dict)
    interest_index = int(window_size / 2)

    # define the input layers
    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)

    # init paddlepaddle
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    feeder = fluid.DataFeeder(feed_list=[data], place=place)
    inference_scope = fluid.core.Scope()

    with fluid.scope_guard(inference_scope):
        [inference_program, feed_target_names,
         fetch_targets] = fluid.io.load_inference_model(model_path, exe)
        for data_ in test_reader():
            # get the word indices and the words as strings
            words_index = [[d[0]] for d in data_]
            words = [reverse_word_dict[d[0][interest_index]] for d in data_]

            # run inference to get predictions
            prediction = exe.run(inference_program,
                                 feed=feeder.feed(words_index),
                                 fetch_list=fetch_targets,
                                 return_numpy=True)

            # get the label tag and the prediction tag
            label_tag = [reverse_lbl_dict[d[1]] for d in data_]
            prediction_tag = [
                reverse_lbl_dict[p.argmax()] for p in prediction[0]
            ]

            # build the source and predicted POS-tagged strings
            source_POS = " ".join(
                ["/".join(items) for items in zip(words, label_tag)])
            prediction_POS = " ".join(
                ["/".join(items) for items in zip(words, prediction_tag)])

            # print both for comparison
            print("%s\ns_POS = %s\np_POS = %s" %
                  ("-" * 40, source_POS, prediction_POS))
Example #2
    def load(self, path, features='BoW', match_avitm=True):
        if path[:2] == '~/':
            path = os.path.join(os.path.expanduser(path[:2]), path[2:])

        ### Specify the file locations
        train_path = path + '/train.npz'
        dev_path = path + '/dev.npz'
        test_path = path + '/test.npz'
        vocab_path = path + '/train.vocab.json'

        ### Load train
        train_csr = load_sparse(train_path)
        train = np.array(train_csr.todense()).astype('float32')

        ### Load dev
        self.dev_counts = load_sparse(dev_path).tocsc() # will be used for NPMI

        ### Load test
        test_csr = load_sparse(test_path)
        test = np.array(test_csr.todense()).astype('float32')

        ### load vocab
        # ENCODING = "ISO-8859-1"
        ENCODING = "utf-8"
        with open(vocab_path, encoding=ENCODING) as f:
            vocab_list = json.load(f)

        # construct maps
        vocab2dim = dict(zip(vocab_list, range(len(vocab_list))))
        dim2vocab = reverse_dict(vocab2dim)

        return [train, None, test, None, None, None], [None, None, None], [vocab2dim, dim2vocab, None, None]
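The load_sparse helper used above is also project code that is not included in the snippet. A plausible minimal version, assuming the .npz files were written with scipy.sparse.save_npz, would be:

from scipy import sparse

def load_sparse(path):
    # Load a SciPy sparse matrix stored as an .npz file (assumed save_npz format).
    return sparse.load_npz(path)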
Example #3
    def load(self,
             path='./data/wikitext-103',
             features='BoW',
             match_avitm=True):
        if path[:2] == '~/':
            path = os.path.join(os.path.expanduser(path[:2]), path[2:])

        ### Specify the file locations
        train_path = path + '/wikitext-103_tra.csr.npz'
        test_path = path + '/wikitext-103_test.csr.npz'
        vocab_path = path + '/vocab.txt'

        ### Load train
        train_csr = sparse.load_npz(train_path)
        train = np.array(train_csr.todense()).astype('float32')

        ### Load test
        test_csr = sparse.load_npz(test_path)
        test = np.array(test_csr.todense()).astype('float32')

        ### load vocab
        ENCODING = "ISO-8859-1"
        # ENCODING = "utf-8"
        with open(vocab_path, encoding=ENCODING) as f:
            vocab_list = [line.strip('\n') for line in f]

        # construct maps
        vocab2dim = dict(zip(vocab_list, range(len(vocab_list))))
        dim2vocab = reverse_dict(vocab2dim)

        return [train, None, test, None, None,
                None], [None, None, None], [vocab2dim, dim2vocab, None, None]
Example #4
    def load(self, path='./nytimes-pbr', features='BoW', match_avitm=False):
        if path[:2] == '~/':
            path = os.path.join(os.path.expanduser(path[:2]), path[2:])

        ### Specify the file locations
        train_path = path + '/input/data/train_tra/nytimes-pbr_tra.csr.npz'
        test_path = path + '/input/data/validation_np/validation_data.csr.npz'
        vocab_path = path + '/vocab.nytimes.txt'

        ### Load train
        # train_csr = sparse.load_npz(train_path)
        # train = np.array(train_csr.todense()).astype('float32')
        train = sparse.load_npz(train_path).astype('float32')
        train = mx.nd.sparse.csr_matrix(train, dtype='float32')

        ### Load test
        # test_csr = sparse.load_npz(test_path)
        # test = np.array(test_csr.todense()).astype('float32')
        test = sparse.load_npz(test_path).astype('float32')
        test = mx.nd.sparse.csr_matrix(test, dtype='float32')

        ### load vocab
        ENCODING = "ISO-8859-1"
        # ENCODING = "utf-8"
        with open(vocab_path, encoding=ENCODING) as f:
            vocab_list = [line.strip('\n') for line in f]

        # construct maps
        vocab2dim = dict(zip(vocab_list, range(len(vocab_list))))
        dim2vocab = reverse_dict(vocab2dim)

        return [train, None, test, None, None, None], [None, None, None], [vocab2dim, dim2vocab, None, None]
Example #5
    def load(self, path='~/20news_sklearn', features='BoW', match_avitm=True):
        if path[:2] == '~/':
            path = os.path.join(os.path.expanduser(path[:2]), path[2:])

        ### Specify the file locations
        train_path = path + '/train_sklearn_avitm.npy'
        train_labels_path = path + '/train_labels_sklearn_avitm.npy'
        test_path = path + '/test_sklearn_avitm.npy'
        test_labels_path = path + '/test_labels_sklearn_avitm.npy'
        vocab_path = path + '/vocab.txt'
        label_names_path = path + '/label_names.txt'

        ### Load train
        train = np.load(train_path).astype('float32')
        if train_labels_path:
            train_labels = np.load(train_labels_path)
        else:
            train_labels = None

        ### Load test
        test = np.load(test_path).astype('float32')
        if test_labels_path:
            test_labels = np.load(test_labels_path)
        else:
            test_labels = None

        ### load vocab
        ENCODING = "ISO-8859-1"
        # ENCODING = "utf-8"
        with open(vocab_path, encoding=ENCODING) as f:
            vocab_list = [line.strip('\n') for line in f]

        ### Load label names
        if label_names_path:
            with open(label_names_path, encoding=ENCODING) as f:
                label_name_list = [line.strip('\n') for line in f]
        else:
            label_name_list = None

        # construct maps
        vocab2dim = dict(zip(vocab_list, range(len(vocab_list))))
        dim2vocab = reverse_dict(vocab2dim)

        topic2dim = dict(zip(label_name_list, range(len(label_name_list))))
        dim2topic = reverse_dict(topic2dim)

        return [train, None, test, None, None, None], [train_labels, None, test_labels], [vocab2dim, dim2vocab, topic2dim, dim2topic]
Example #6
    def load(self, data_path, features='BoW', match_avitm=True):

        ### Specify the file locations
        train_path = data_path + '/train.npz'
        dev_path = data_path + '/dev.npz'
        test_path = data_path + '/test.npz'
        vocab_path = data_path + '/train.vocab.json'

        ### Load train
        train_csr = load_sparse(train_path)
        train_counts = np.array(train_csr.todense()).astype('float32')
        train_bert_logits = np.load(self.logit_path + "/train.npy")
        train = np.concatenate([train_counts, train_bert_logits], axis=1)

        if self.logit_clip is not None:
            # limit the document representations to the top k labels
            doc_tokens = np.sum(train_counts > 0, axis=1)
            vocab_size = train_counts.shape[1]

            for i, (row, total) in enumerate(zip(train_bert_logits,
                                                 doc_tokens)):
                k = self.logit_clip * total  # keep this many logits
                if k < vocab_size:
                    min_logit = np.quantile(row, 1 - k / vocab_size)
                    train_bert_logits[
                        i, train_bert_logits[i] < min_logit] = -np.inf

        #min_logits = np.quantile(train_bert_logits, np.quantile(train_counts.sum(1), 0.9) / 20_000, axis=1)
        #train_bert_logits[train_bert_logits < min_logits.reshape(-1, 1)] = -np.inf

        ### Load dev
        self.dev_counts = load_sparse(
            dev_path).tocsc()  # will be used for NPMI

        ### Load test
        test_csr = load_sparse(test_path)
        test_counts = np.array(test_csr.todense()).astype('float32')
        test_bert_logits = np.ones_like(test_counts)
        test = np.concatenate([test_counts, test_bert_logits], axis=1)

        ### load vocab
        # ENCODING = "ISO-8859-1"
        ENCODING = "utf-8"
        with open(vocab_path, encoding=ENCODING) as f:
            vocab_list = json.load(f)

        # construct maps
        vocab2dim = dict(zip(vocab_list, range(len(vocab_list))))
        dim2vocab = reverse_dict(vocab2dim)

        return [train, None, test, None, None,
                None], [None, None, None], [vocab2dim, dim2vocab, None, None]
Example #7
def get_display_states(component):
    """ get configured display states from trac.ini if configured
    [testmanager]
    passed = foo
    passed_comment = bar
    failed = doh
    skipped = n
    not_tested = -
    """
    states = dict(
        [option for option in component.config.options('testmanager')]
    )
    if not states or len(states) != len(STATES_DISPLAY):
        return reverse_dict(STATES_DISPLAY)
    return states
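For illustration, with the trac.ini section shown in the docstring, component.config.options('testmanager') yields exactly those key/value pairs, so states would be built as:

# states dict resulting from the docstring's example configuration
states = {
    'passed': 'foo',
    'passed_comment': 'bar',
    'failed': 'doh',
    'skipped': 'n',
    'not_tested': '-',
}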
Example #8
    def __init__(self,
                 language="american_english",
                 min_word_size=3,
                 config="config.json",
                 print_search_progress=False):

        self.digit_map = utils.get_digit_map(file=config)
        self.char_map = utils.reverse_dict(self.digit_map)
        allowed_languages = utils.get_language_map(file=config)
        self.csp_solver = CspSolver(
            config=config,
            language=allowed_languages[language],
            print_search_progress=print_search_progress)

        self.min_word_size = min_word_size
Example #9
    def load(self,
             path='./yelp_review_polarity_csv',
             features='BoW',
             match_avitm=False):
        if path[:2] == '~/':
            path = os.path.join(os.path.expanduser(path[:2]), path[2:])

        ### Specify the file locations
        train_path = path + '/yelp_review_polarity_csv_train.npz'
        val_path = path + '/yelp_review_polarity_csv_val.npz'
        test_path = path + '/yelp_review_polarity_csv_test.npz'
        vocab_path = path + '/vocab.txt'

        ### Load train
        # train_csr = sparse.load_npz(train_path)
        # train = np.array(train_csr.todense()).astype('float32')
        train = sparse.load_npz(train_path).astype('float32')
        train = mx.nd.sparse.csr_matrix(train, dtype='float32')

        ### Load val
        val = sparse.load_npz(val_path).astype('float32')
        val = mx.nd.sparse.csr_matrix(val, dtype='float32')

        ### Load test
        # test_csr = sparse.load_npz(test_path)
        # test = np.array(test_csr.todense()).astype('float32')
        test = sparse.load_npz(test_path).astype('float32')
        test = mx.nd.sparse.csr_matrix(test, dtype='float32')

        ### load vocab
        # ENCODING = "ISO-8859-1"
        ENCODING = "utf-8"
        with open(vocab_path, encoding=ENCODING) as f:
            vocab_list = [line.strip('\n') for line in f]

        # construct maps
        vocab2dim = dict(zip(vocab_list, range(len(vocab_list))))
        dim2vocab = reverse_dict(vocab2dim)

        return [train, val, test, None, None,
                None], [None, None, None], [vocab2dim, dim2vocab, None, None]
Example #10
    def align_records(self, r1, r2):
        """
        Partially align one DOM tree list to another DOM tree list.

        e.g. (taken from [1]):

        >>> from lxml.html import fragment_fromstring
        >>> from .mdr import Record
        >>> pta = PartialTreeAligner()

        1. "flanked by 2 sibling nodes"
        >>> t1 = fragment_fromstring("<p> <a></a> <b></b> <e></e> </p>")
        >>> t2 = fragment_fromstring("<p> <b></b> <c></c> <d></d> <e></e> </p>")
        >>> _, _, mapping = pta.align_records(Record(t1), Record(t2))
        >>> [e.tag for e in t1]
        ['a', 'b', 'c', 'd', 'e']
        >>> sorted([e.tag for e in mapping.itervalues()])
        ['b', 'c', 'd', 'e', 'p']

        2. "rightmost nodes"
        >>> t1 = fragment_fromstring("<p> <a></a> <b></b> <e></e> </p>")
        >>> t2 = fragment_fromstring("<p> <e></e> <f></f> <g></g> </p>")
        >>> _, _, mapping = pta.align_records(Record(t1), Record(t2))
        >>> [e.tag for e in t1]
        ['a', 'b', 'e', 'f', 'g']
        >>> sorted([e.tag for e in mapping.itervalues()])
        ['e', 'f', 'g', 'p']

        3. "leftmost nodes"
        >>> t1 = fragment_fromstring("<p> <a></a> <b></b> <e></e> </p>")
        >>> t2 = fragment_fromstring("<p> <f></f> <g></g> <a></a> </p>")
        >>> _, _, mapping = pta.align_records(Record(t1), Record(t2))
        >>> [e.tag for e in t1]
        ['f', 'g', 'a', 'b', 'e']
        >>> sorted([e.tag for e in mapping.itervalues()])
        ['a', 'f', 'g', 'p']

        4. "no unique insertion"
        >>> t1 = fragment_fromstring("<p> <a></a> <b></b> <e></e> </p>")
        >>> t2 = fragment_fromstring("<p> <a></a> <g></g> <e></e> </p>")
        >>> _, _, mapping = pta.align_records(Record(t1), Record(t2))
        >>> [e.tag for e in t1]
        ['a', 'b', 'e']
        >>> sorted([e.tag for e in mapping.itervalues()])
        ['a', 'e', 'p']

        5. "multiple unaligned nodes"
        >>> t1 = fragment_fromstring("<p> <x></x> <b></b> <d></d> </p>")
        >>> t2 = fragment_fromstring("<p> <b></b> <c></c> <d></d> <h></h> <k></k> </p>")
        >>> _, _, mapping = pta.align_records(Record(t1), Record(t2))
        >>> [e.tag for e in t1]
        ['x', 'b', 'c', 'd', 'h', 'k']
        >>> sorted([e.tag for e in mapping.itervalues()])
        ['b', 'c', 'd', 'h', 'k', 'p']

        References
        ----------
        [1] Web Data Extraction Based on Partial Tree Alignment
        <http://dl.acm.org/citation.cfm?id=1060761>

        """
        alignment = self.sta.align_records(r1, r2)
        aligned = {alignment.first: alignment.second}

        for sub in alignment.subs:
            aligned[sub.first] = sub.second

        # add reverse mapping too
        reverse_aligned = reverse_dict(aligned)

        modified = False

        unaligned_elements = self.find_unaligned_elements(aligned, r2)
        for l in unaligned_elements:
            left_most = l[0]
            right_most = l[-1]

            prev_sibling = left_most.getprevious()
            next_sibling = right_most.getnext()

            if prev_sibling is None:
                if next_sibling is not None:
                    # leftmost alignment
                    next_sibling_match = reverse_aligned.get(next_sibling, None)
                    for i, element in enumerate(l):
                        element_copy = copy.deepcopy(element)
                        next_sibling_match.getparent().insert(i, element_copy)
                        aligned.update({element_copy: element})
                    modified = True
            elif next_sibling is None:
                # rightmost alignment
                prev_sibling_match = reverse_aligned.get(prev_sibling, None)
                previous_match_index = self._get_index(prev_sibling_match)
                # unique insertion
                for i, element in enumerate(l):
                    element_copy = copy.deepcopy(element)
                    prev_sibling_match.getparent().insert(previous_match_index + 1 + i, element_copy)
                    aligned.update({element_copy: element})
                modified = True
            else:
                # flanked by two sibling elements
                prev_sibling_match = reverse_aligned.get(prev_sibling, None)
                next_sibling_match = reverse_aligned.get(next_sibling, None)

                if prev_sibling_match is not None and next_sibling_match is not None:
                    next_match_index = self._get_index(next_sibling_match)
                    previous_match_index = self._get_index(prev_sibling_match)
                    if next_match_index - previous_match_index == 1:
                        # unique insertion
                        for i, element in enumerate(l):
                            element_copy = copy.deepcopy(element)
                            prev_sibling_match.getparent().insert(previous_match_index + 1 + i, element_copy)
                            aligned.update({element_copy: element})
                        modified = True
        return modified, len(unaligned_elements) > 0, aligned
Example #11
def infer(test_reader, use_cuda=False, model_path=None):
    """
    inference function
    """
    if model_path is None or not os.path.exists(model_path):
        print(str(model_path) + " cannot be found")
        return
    # get the reverse dict
    reverse_word_dict = reverse_dict(word_dict)

    # define the input layers
    hidden = fluid.layers.data(name="hidden", shape=[4096], dtype="float32")
    cell = fluid.layers.data(name="cell", shape=[4096], dtype="float32")
    pre_word = fluid.layers.data(name="pre_words", shape=[1], dtype="int64")

    # init paddlepaddle
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    feeder = fluid.DataFeeder(feed_list=[hidden, cell, pre_word], place=place)
    inference_scope = fluid.core.Scope()

    ##
    start_word_id = word_dict["__start__"]
    end_word_id = word_dict["__end__"]

    with fluid.scope_guard(inference_scope):
        [inference_program, feed_target_names,
         fetch_targets] = fluid.io.load_inference_model(model_path, exe)
        for data_ in test_reader():
            # get the source words as strings
            # words_index = [d[0] for d in data_]
            words = [reverse_word_dict[d] for d in data_[1]]

            img_feat, word_list = data_
            prev_hidden_, prev_cell_, prediction = img_feat, img_feat, start_word_id
            prediction_list = []
            ##
            # run inference step by step to predict the word sequence
            for ii in range(MAX_LEN):
                if ii == 0:
                    # data_lstm = [(start_word_id, start_word_id, img_feat, img_feat)]
                    data_lstm = [[prev_hidden_, prev_cell_, start_word_id]]
                    prediction, prev_hidden, prev_cell = exe.run(
                        inference_program,
                        feed=feeder.feed(data_lstm),
                        fetch_list=fetch_targets,
                        return_numpy=True)

                    prediction = prediction[0].argmax()
                else:
                    # pre_words = word_list[ii - 1]
                    data_lstm = [[prev_hidden_, prev_cell_, prediction]]

                    prediction, prev_hidden, prev_cell = exe.run(
                        inference_program,
                        feed=feeder.feed(data_lstm),
                        fetch_list=fetch_targets,
                        return_numpy=True)
                    prediction = prediction[0].argmax()
                prediction_list.append(prediction)

                if prediction == end_word_id:
                    break

            prediction_tag = [reverse_word_dict[p] for p in prediction_list]

            prediction_words = " ".join(prediction_tag)
            source_words = " ".join(words)
            # print both for comparison
            print("%s\ns_POS = %s\np_POS = %s" %
                  ("-" * 40, source_words, prediction_words))
Example #13
import pandas as pd
import numpy as np
import networkx as nx
import snap
import community
import random
import utils

G, G_weighted = utils.loadGraphs()

R_t, G_2, layers_2 = utils.hicode(G_weighted, 2)
print("hicode 2", R_t)
nx.write_weighted_edgelist(G_weighted, 'results/G2_weighted.edgelist')

nodes_mapping = utils.load_nodes_mapping()
partitions = []
for num_layer, layer in enumerate(layers_2):
    print("community_count", len(layer))
    revised_community_count = 0
    for i, subgraph in enumerate(layer):
        if len(subgraph.nodes) > 100:
            revised_community_count += 1
    print("revised_community_count", revised_community_count)

    partition = utils.layer_to_partition(layer, G_weighted)
    print("layer number ", num_layer + 1)
    print(utils.modularity(partition, G_weighted))
    print(utils.modularity(partition, G_2))
    partitions.append(partition)
    reverse_comms = utils.reverse_dict(partition)
    utils.write_results_to_file(reverse_comms, nodes_mapping,
                                "layer_" + str(num_layer + 1))
Example #14
def exportBio2RDFFeature():
    fin = open(const.BIO2RDF_DRUG_TRIPLE_PATH)
    featureMap = dict()
    featureCount = dict()

    dDrug2Bio2RDFFeature = dict()
    currentDrug = ""
    currentBio2RDFFeature = []
    while True:
        line = fin.readline()
        if line == "":
            #fout.write("%s|%s\n" % (currentDrug, ",".join(int2StringArray(currentBio2RDFFeature))))
            dDrug2Bio2RDFFeature[currentDrug] = currentBio2RDFFeature
            break
        parts = line.strip().split("\t")
        if len(parts) != 3:
            print("Error")
            print(line)
            exit(-1)
        drugId = parts[0]
        if drugId != currentDrug:
            if currentDrug != "":
                #fout.write("%s|%s\n" % (currentDrug, ",".join(int2StringArray(currentBio2RDFFeature))))
                dDrug2Bio2RDFFeature[currentDrug] = currentBio2RDFFeature

            currentDrug = drugId
            currentBio2RDFFeature = []
        predicate = parts[1]
        obj = parts[2]

        isSkipped = False
        for skipPattern in PREDICATE_SKIP_PATTERNS:
            if skipPattern in predicate:
                isSkipped = True
                break

        if isSkipped:
            continue

        feature = "%s|%s" % (predicate, obj)
        featureId = utils.get_update_dict_index(featureMap, feature)
        utils.add_dict_counter(featureCount, featureId)
        currentBio2RDFFeature.append(featureId)

    fin.close()

    #sorted = utils.sort_dict(featureCount)
    #print (sorted[-10:])

    newFeatureMap = dict()
    for featureId, count in featureCount.items():
        if count < MIN_FEATURE_COUNT:
            continue
        utils.get_update_dict_index(newFeatureMap, featureId)
    print("After filtering: ", len(newFeatureMap))

    fout = open(const.BIO2RDF_FEATURE_PATH, "w")

    for drugId, features in dDrug2Bio2RDFFeature.items():
        newFeatureAr = []
        for feature in features:
            newFeatureId = utils.get_dict(newFeatureMap, feature, -1)
            if newFeatureId != -1:
                newFeatureAr.append(newFeatureId)

        strArr = int2StringArray(newFeatureAr)
        fout.write("%s|%s\n" % (drugId, ",".join(strArr)))
    fout.close()

    fout = open("%s_Feature" % const.BIO2RDF_FEATURE_PATH, "w")

    revertNewFeatureMap = utils.reverse_dict(newFeatureMap)
    revertOldFeatureMap = utils.reverse_dict(featureMap)
    for newFeatureMapId, oldFeatureMapId in revertNewFeatureMap.items():
        fout.write("%s|%s\n" %
                   (newFeatureMapId, revertOldFeatureMap[oldFeatureMapId]))
    fout.close()
    fout = open(const.BIO2RDF_INFO, "w")
    fout.write("Num feature: %s\n" % len(newFeatureMap))
    fout.close()
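The utils helpers used in this example are not shown; minimal sketches inferred from the call sites (assumptions, not the project's actual code):

def get_update_dict_index(d, key):
    # Return the index already assigned to key, or assign the next free index.
    return d.setdefault(key, len(d))

def add_dict_counter(counter, key, amount=1):
    # Increment a count kept in a plain dict.
    counter[key] = counter.get(key, 0) + amount

def get_dict(d, key, default=None):
    # dict.get with an explicit default (the example passes -1).
    return d.get(key, default)

def int2StringArray(values):
    # Convert a list of ints to strings so they can be joined with ",".
    return [str(v) for v in values]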


def write_dict():
    cs = open("resource/gb2312_list.txt", 'r').read()
    index = 134
    with open("resource/new_dic2.txt", 'a') as f:
        for c in cs:
            f.write("%d\t%c\n" % (index, c))
            index = index + 1


# python gen_record_crnn.py --dataset_name=train --dataset_dir=out --dataset_nums=10000 --output_dir=datasets/vgg_train
if __name__ == '__main__':
    chinese_dict = read_dict(FLAGS.dict_text)
    chinese_dict_ids = reverse_dict(chinese_dict)
    # print([chinese_dict[code] for code in "你好呀!"])
    # print([chinese_dict_ids[code] for code in [chinese_dict[code] for code in "你好呀!"]])
    # make_tfrecord2(chinese_dict, FLAGS.dataset_name, FLAGS.dataset_nums)

    # write_dict()
    # words = open("resource/gb2312_list.txt", 'r').read()
    # print(words)

    parse_tfrecord_file()

    #
    # import datasets

    # print(getattr(datasets, "my_data"))
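read_dict is not shown either; judging from the 'index<TAB>character' lines produced by write_dict above, a minimal counterpart (an assumption about the file format) could be:

def read_dict(dict_path):
    # Parse 'index<TAB>char' lines into a {char: index} mapping (assumed format).
    chinese_dict = {}
    with open(dict_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\n')
            if not line:
                continue
            index, char = line.split('\t', 1)
            chinese_dict[char] = int(index)
    return chinese_dict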