Example #1
    def load_corpus(self, corpus_dir):
        """Load the corpus from a given directory, then initialize the
        documents and the model.

        Line format: token1 \t token2 \t token3 \t ...
        """
        self.documents = []
        rand = random.Random()

        logging.info('Loading corpus from %s.', corpus_dir)
        for root, dirs, files in os.walk(corpus_dir):
            for f in files:
                filename = os.path.join(root, f)
                logging.info('Loading file %s.', filename)
                # Open in binary mode so the GBK decode below behaves the
                # same under Python 2 and Python 3.
                with open(filename, 'rb') as fp:
                    for doc_str in fp:
                        doc_str = doc_str.decode('gbk')
                        doc_tokens = doc_str.strip().split('\t')
                        if len(doc_tokens) < 2:
                            continue
                        document = Document(self.model.num_topics)
                        document.parse_from_tokens(doc_tokens, rand,
                                                   self.vocabulary)
                        if document.num_words() < 2:
                            continue
                        self.documents.append(document)

        logging.info('The corpus contains %d documents.', len(self.documents))
        self._initialize_model()

        self._compute_smoothing_only_bucket()
        self._initialize_topic_word_coefficient()
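The docstring above spells out the on-disk format: one document per line, tokens separated by tabs, each line GBK-encoded. A minimal sketch of producing a compatible corpus file (write_corpus_file and the sample tokens are illustrative, not part of the project):

import os

def write_corpus_file(path, tokenized_docs):
    # One tab-separated line per document, GBK-encoded to match
    # the loader's doc_str.decode('gbk').
    with open(path, 'wb') as fp:
        for tokens in tokenized_docs:
            line = '\t'.join(tokens) + '\n'
            fp.write(line.encode('gbk'))

write_corpus_file(os.path.join('corpus_dir', 'part-00000'),
                  [['apple', 'ipad', 'macbook'],
                   ['nokia', 'android', 'chrome']])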
Example #3
    def _inference_one_chain(self, doc_tokens, rand):
        """Infer topics with one Markov chain.

        Returns the sparse topic distribution p(z|d).
        """
        document = Document(self.model.num_topics)
        document.parse_from_tokens(doc_tokens, rand, self.vocabulary,
                                   self.model)
        if document.num_words() == 0:
            return dict()

        accumulated_topic_hist = {}
        for i in xrange(self.total_iterations):
            # One Gibbs sampling sweep over the document.
            for word in document.get_words():
                # Remove the word's current topic assignment ...
                document.decrease_topic(word.topic, 1)

                new_topic = self._sample_word_topic(document, word.id, rand)
                assert new_topic is not None
                word.topic = new_topic
                # ... and add back the newly sampled one.
                document.increase_topic(new_topic, 1)

            # After burn-in, accumulate the document's topic histogram.
            if i >= self.burn_in_iterations:
                for non_zero in document.doc_topic_hist.get_non_zeros():
                    accumulated_topic_hist[non_zero.topic] = \
                        accumulated_topic_hist.get(non_zero.topic, 0) + \
                        non_zero.count

        return self._l1normalize_distribution(accumulated_topic_hist)
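Example #3 repeatedly decrements a word's topic count, resamples the topic, and increments the new count; _sample_word_topic and the sparse histograms are defined elsewhere in the project. A self-contained sketch of the same decrement/sample/increment update on plain lists, with illustrative names (gibbs_sweep, phi, alpha) that are not part of this project's API:

import random

def gibbs_sweep(word_ids, assignments, doc_topic_counts, phi, alpha, rand):
    """One Gibbs sweep: resample the topic of every word in one document.

    phi[k][w] is p(word w | topic k) from a trained model; alpha is the
    symmetric Dirichlet prior on the document-topic distribution.
    """
    num_topics = len(phi)
    for n, w in enumerate(word_ids):
        # Remove the word's current assignment from the document counts.
        doc_topic_counts[assignments[n]] -= 1

        # p(z = k | rest) is proportional to (n_dk + alpha) * phi[k][w].
        weights = [(doc_topic_counts[k] + alpha) * phi[k][w]
                   for k in range(num_topics)]

        # Draw a topic index proportional to the weights.
        r = rand.uniform(0, sum(weights))
        new_topic, acc = 0, weights[0]
        while acc < r and new_topic < num_topics - 1:
            new_topic += 1
            acc += weights[new_topic]

        # Record the new assignment and restore the counts.
        assignments[n] = new_topic
        doc_topic_counts[new_topic] += 1

# Toy example: two topics, three-word vocabulary, one three-word document.
phi = [[0.6, 0.3, 0.1], [0.1, 0.2, 0.7]]
assignments = [0, 1, 0]          # current topic of each word position
counts = [2, 1]                  # document-topic histogram for `assignments`
gibbs_sweep([0, 2, 1], assignments, counts, phi, alpha=0.1,
            rand=random.Random(0))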
Example #5
    def test_compute_loglikelihood(self):
        doc_tokens = ['macbook', 'ipad',  # in both the vocabulary and the model
                      'mac os x', 'chrome',  # only in the vocabulary
                      'nokia', 'null']  # in neither
        document = Document(self.model.num_topics)
        rand = random.Random()
        rand.seed(0)
        document.parse_from_tokens(
            doc_tokens, rand, self.vocabulary, self.model)
        documents = [document, document]
        self.assertEqual(-14.113955684239654,
                         self.model_evaluator.compute_loglikelihood(documents))
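The expected log-likelihood is compared with assertEqual against an exact float literal. If bit-for-bit reproducibility is not required, a tolerance-based check is less brittle; a hedged alternative for the same assertion:

        self.assertAlmostEqual(
            -14.113955684239654,
            self.model_evaluator.compute_loglikelihood(documents),
            places=7)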
Example #6
    def get_document(self):

        self.__doc = Document(text=self.__text)

        self.__doc.set_emails(self.__emails)
        self.__doc.set_links(self.__links)
        self.__doc.set_hash_tags(self.__hash_tags)

        self.__doc.set_tokens(self.__tokens)

        self.__doc.set_sentences(list(self.__blob.sentences))

        return self.__doc
Example #8
    def _load_corpus(self, corpus_dir):
        self.documents = []
        if not os.path.exists(corpus_dir):
            logging.error('The corpus directory %s does not exist.',
                          corpus_dir)
            return False

        for root, dirs, files in os.walk(corpus_dir):
            for f in files:
                filename = os.path.join(root, f)
                with open(filename, 'rb') as fp:
                    record_reader = RecordReader(fp)
                    while True:
                        blob = record_reader.read()
                        if blob is None:
                            break
                        document = Document(self.model.num_topics)
                        document.parse_from_string(blob)
                        self.documents.append(document)

        return True
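RecordReader is not shown in these examples; from the loop above, its read() returns one serialized document blob per call and None at end of file. A sketch of one common way to implement that contract, assuming a 4-byte big-endian length prefix before each record (the framing and the name LengthPrefixedRecordReader are assumptions, not the project's actual format):

import struct

class LengthPrefixedRecordReader(object):
    """Reads records framed as: 4-byte big-endian length, then payload."""

    def __init__(self, fp):
        self.fp = fp

    def read(self):
        header = self.fp.read(4)
        if len(header) < 4:
            return None  # end of file
        (length,) = struct.unpack('>I', header)
        blob = self.fp.read(length)
        if len(blob) < length:
            return None  # truncated record, treat as end of stream
        return blob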
Example #9
def load_corpus(instance_folder_path: str,
                non_instance_folder_path: str) -> Corpus:
    files = [(file, DocumentClass.INSTANCE)
             for file in listdir(instance_folder_path)
             if isfile(join(instance_folder_path, file))]
    files += [(file, DocumentClass.NON_INSTANCE)
              for file in listdir(non_instance_folder_path)
              if isfile(join(non_instance_folder_path, file))]

    documents = []
    for file, doc_class in files:
        folder_path = (instance_folder_path
                       if doc_class == DocumentClass.INSTANCE
                       else non_instance_folder_path)
        file_path = join(folder_path, file)
        with open(file_path, 'r') as document:
            documents.append(Document(document.read(), doc_class))

    return Corpus(documents)
Example #10
def load_documents(folder_path: str) -> List[Document]:
    files = [
        file for file in listdir(folder_path)
        if isfile(join(folder_path, file))
    ]

    documents = []
    for file in files:
        with open(join(folder_path, file), 'r') as document:
            doc = Document(document.read(), DocumentClass.UNKNOWN)
            documents.append(doc)

    return documents
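Both loaders take plain directory paths; a short usage sketch (the directory names are placeholders):

# Labelled training corpus: one folder per class.
corpus = load_corpus('data/instances', 'data/non_instances')

# Unlabelled documents, e.g. to classify later.
unknown_docs = load_documents('data/unlabelled')
print(len(unknown_docs))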
Example #11
def read_corenlp_doc(filename, verbose=True):
    if verbose:
        log.info('Reading CoreNLP document from {}'.format(filename))

    input_xml = smart_file_handler(filename)

    xml_parser = etree.XMLParser(target=CoreNLPTarget())
    sents, corefs = etree.parse(input_xml, xml_parser)
    doc_name = splitext(basename(filename))[0]
    doc = Document.construct(doc_name, sents, corefs)

    input_xml.close()

    return doc
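etree.XMLParser(target=...) streams parse events into the target object, and when a target is set the result of parsing is whatever the target's close() returns, which is how sents and corefs come back as a pair here. CoreNLPTarget itself is not shown; a minimal illustration of the target interface using the standard library (SentenceCounter and the tiny XML snippet are illustrative, not the real CoreNLP schema):

from xml.etree.ElementTree import XMLParser

class SentenceCounter(object):
    """Collects the text of <word> elements and counts <sentence> elements."""

    def __init__(self):
        self.words = []
        self.sentences = 0
        self._in_word = False

    def start(self, tag, attrib):   # called for each opening tag
        if tag == 'sentence':
            self.sentences += 1
        self._in_word = (tag == 'word')

    def end(self, tag):             # called for each closing tag
        self._in_word = False

    def data(self, text):           # called for text between tags
        if self._in_word and text.strip():
            self.words.append(text.strip())

    def close(self):                # its return value is the parse result
        return self.words, self.sentences

parser = XMLParser(target=SentenceCounter())
parser.feed('<doc><sentence><word>Hello</word><word>world</word></sentence></doc>')
words, n_sents = parser.close()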
Example #12
def read_doc_from_ontonotes(coref_doc, name_doc, verbose=True):
    doc_id = coref_doc.document_id.split('@')[0]
    assert doc_id == name_doc.document_id.split('@')[0], \
        '{} and {} do not have the same document_id'.format(coref_doc, name_doc)

    if verbose:
        log.info('Reading ontonotes document {}'.format(doc_id))

    conll_file_path = join(ontonotes_annotations_source, doc_id + '.depparse')

    all_sents = read_conll_depparse(conll_file_path)

    all_corefs = read_coref_doc(coref_doc)

    doc_name = doc_id.split('/')[-1]
    doc = Document.construct(doc_name, all_sents, all_corefs)

    for name_entity in read_name_doc(name_doc):
        add_name_entity_to_doc(doc, name_entity)

    return doc
Example #13
class GraphematicalAnalysis:
    def __init__(self, text=None):
        self.__text = text

        self.__blob = None

        self.__tokens = []

        self.__DEL = [' ', '  ', '    ', '\t', '\n']
        self.__SIG = [
            '.', ',', '-', '—', '!', '?', ';', ':', '(', ')', '[', ']', '{',
            '}'
        ]
        self.__SYM = ['«', '»', '“', '”', '"', '``', '\'\'']
        self.__RLE = [
            'й', 'ц', 'у', 'к', 'е', 'н', 'г', 'ш', 'щ', 'з', 'х', 'ъ', 'ф',
            'ы', 'в', 'а', 'п', 'р', 'о', 'л', 'д', 'ж', 'э', 'я', 'ч', 'с',
            'м', 'и', 'т', 'ь', 'б', 'ю', 'ё', 'Й', 'Ц', 'У', 'К', 'Е', 'Н',
            'Г', 'Ш', 'Щ', 'З', 'Х', 'Ъ', 'Ф', 'Ы', 'В', 'А', 'П', 'Р', 'О',
            'Л', 'Д', 'Ж', 'Э', 'Я', 'Ч', 'С', 'М', 'И', 'Т', 'Ь', 'Б', 'Ю',
            'Ё'
        ]

        self.__LLE = [
            'q', 'w', 'e', 'r', 't', 'y', 'u', 'i', 'o', 'p', 'a', 's', 'd',
            'f', 'g', 'h', 'j', 'k', 'l', 'z', 'x', 'c', 'v', 'b', 'n', 'm',
            'Q', 'W', 'E', 'R', 'T', 'Y', 'U', 'I', 'O', 'P', 'A', 'S', 'D',
            'F', 'G', 'H', 'J', 'K', 'L', 'Z', 'X', 'C', 'V', 'B', 'N', 'M'
        ]

        self.__DC = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
        self.__END = ['.', '!', '?']

        self.__emails = []
        self.__hash_tags = []
        self.__links = []

        self.__tokenization_result = []

        self.__doc = None

        self.__regexp_dir = os.getcwd() + '/common/'

    def __pre_processing(self):

        # Clear all previous results

        self.__tokens = []
        self.__tokenization_result = []

        # Extract all emails & replace them with '__EMAIL'
        self.__emails = self.extract_email_addresses(self.__text)
        for email in self.__emails:
            self.__text = self.__text.replace(email, '__EMAIL')

        # Extract all hashtags & replace them with '__HASHTAG'
        self.__hash_tags = self.extract_hash_tags(self.__text)
        for tag in self.__hash_tags:
            self.__text = self.__text.replace(tag, '__HASHTAG')

        # Extract all links & replace them with '__LINK'
        self.__links = self.extract_links(self.__text)
        for link in self.__links:
            self.__text = self.__text.replace(link, '__LINK')

        self.__blob = TextBlob(self.__text)

        # Replacing quotes like '``' with "\""
        for token in self.__blob.tokens:
            new_token = str(token)
            if new_token.startswith('``'):
                new_token = "\""
            elif new_token.endswith('\'\''):
                new_token = "\""
            self.__tokenization_result.append(new_token)

        # Removing unicode special character in first token
        # self.__tokenization_result[0] = self.__tokenization_result[0][1:]

        quotes = OrderedDict()

        # Search for quotes, delete them and remember their positions
        for i in range(0, len(self.__tokenization_result)):
            token = self.__tokenization_result[i]

            if token.startswith("«") or token.startswith("\""):
                if len(token) > 2:
                    # Remember the opening quote, then strip it from the token
                    quotes[i + len(quotes)] = token[0]
                    token = token[1:]
                    self.__tokenization_result[i] = token

            if token.endswith("»") or token.endswith("\""):
                if len(token) > 2:
                    # Remember the closing quote, then strip it from the token
                    quotes[i + len(quotes) + 1] = token[-1]
                    self.__tokenization_result[i] = token[:-1]

        # Insert the remembered quotes back as individual tokens
        for key in quotes.keys():
            self.__tokenization_result.insert(key, quotes[key])

    def analysis(self):

        self.__pre_processing()

        current_email = 0
        current_hash_tag = 0
        current_link = 0

        # Add descriptors & labels for each token in the text
        for raw_token in self.__tokenization_result:

            if raw_token == '__EMAIL':
                raw_token = self.__emails[current_email]
                current_email += 1
            elif raw_token == '__HASHTAG':
                raw_token = self.__hash_tags[current_hash_tag]
                current_hash_tag += 1
            elif raw_token == '__LINK':
                raw_token = self.__links[current_link]
                current_link += 1

            if self.index_of_any(raw_token, self.__DEL):
                # Delimiter
                token = Token(text=raw_token, grapheme=Grapheme.DEL)

                # labels
                token.add_label(Label.SPACE)

                self.__tokens.append(token)

            elif self.index_of_any(raw_token, self.__RLE):
                # Russian lexeme
                token = Token(text=raw_token, grapheme=Grapheme.RLE)

                # labels
                token.add_label(Label.WORD)
                token.add_label(Label.CYRIL)

                self.__tokens.append(token)

            elif self.index_of_any(raw_token, self.__SYM):
                # Symbol
                token = Token(text=raw_token, grapheme=Grapheme.SYM)

                # labels
                token.add_label(Label.QUOTE)
                token.add_label(Label.MARKUP)

                if raw_token == "«" or raw_token == "“":
                    token.add_label(Label.OPENING)
                elif raw_token == "»" or raw_token == "”":
                    token.add_label(Label.CLOSING)

                self.__tokens.append(token)

            elif self.index_of_any(raw_token, self.__LLE):
                # Latin lexeme
                token = Token(text=raw_token, grapheme=Grapheme.LLE)

                # labels
                token.add_label(Label.WORD)
                token.add_label(Label.LATIN)

                self.__tokens.append(token)

            elif self.index_of_any(raw_token, self.__DC):
                # Digits complex
                token = Token(text=raw_token, grapheme=Grapheme.DC)

                # labels
                token.add_label(Label.NUMBER)

                self.__tokens.append(token)

            else:
                if self.index_of_any(raw_token, self.__SIG):
                    # Signum
                    token = Token(text=raw_token, grapheme=Grapheme.SIG)

                    # labels
                    token.add_label(Label.PUNCT)

                    if raw_token == "(" or raw_token == "[" or raw_token == '{':
                        token.add_label(Label.OPENING)
                    elif raw_token == ")" or raw_token == "]" or raw_token == '}':
                        token.add_label(Label.CLOSING)

                    self.__tokens.append(token)
                else:
                    # Composite token TODO: add #hashtag, email, phone labels
                    token = Token(text=raw_token, grapheme=Grapheme.COMPOSITE)

                    # labels
                    if raw_token in self.__emails:
                        token.add_label(Label.EMAIL)
                    elif raw_token in self.__hash_tags:
                        token.add_label(Label.HASHTAG)
                    elif raw_token in self.__links:
                        token.add_label(Label.LINK)
                    elif self.is_word_with_a_hyphen(raw_token):
                        token.add_label(Label.WORD)
                        token.add_label(Label.CYRIL)
                    else:
                        token.add_label(Label.OTHER)

                    self.__tokens.append(token)

            space_token = Token(text=" ", grapheme=Grapheme.DEL)
            space_token.add_label(Label.SPACE)

            # self.__tokens.append(space_token)

        return self.__tokens

    @staticmethod
    def index_of_any(source, dictionary):
        """Return True if every character of source is in dictionary."""
        return all(ch in dictionary for ch in source)

    @staticmethod
    def intersects(source, dictionary):
        """Return True if at least one character of source is in dictionary."""
        return any(ch in dictionary for ch in source)

    @staticmethod
    def extract_email_addresses(string):

        r = re.compile(r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-z]{2,5})")
        return r.findall(string)

    @staticmethod
    def is_word_with_a_hyphen(string):
        # Require at least one hyphen between word characters,
        # e.g. 'кто-нибудь'.
        return re.search(r'\w+(?:-\w+)+', string) is not None

    def extract_links(self, string):

        with open(self.__regexp_dir + 'link_regexp.txt', 'r') as f:
            pattern = f.read()

        r = re.compile(pattern)
        return r.findall(string)

    @staticmethod
    def extract_hash_tags(string):
        r = re.compile(r'#\w*')
        return r.findall(string)

    def set_text(self, text):
        self.__text = text

    def get_tokens(self):
        # self.__tokens.pop()
        return self.__tokens

    def get_emails(self):
        return self.__emails

    def get_links(self):
        return self.__links

    def get_hash_tags(self):
        return self.__hash_tags

    def get_document(self):

        self.__doc = Document(text=self.__text)

        self.__doc.set_emails(self.__emails)
        self.__doc.set_links(self.__links)
        self.__doc.set_hash_tags(self.__hash_tags)

        self.__doc.set_tokens(self.__tokens)

        self.__doc.set_sentences(list(self.__blob.sentences))

        return self.__doc
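A short usage sketch for the class above, assuming the supporting Token, Label, Grapheme and Document classes, TextBlob, and the common/link_regexp.txt pattern file are available as in the original project (the sample text is illustrative):

analyzer = GraphematicalAnalysis()
analyzer.set_text('Пишите на test@example.com, смотрите #новости: https://example.com')
tokens = analyzer.analysis()       # Token objects with grapheme classes and labels

doc = analyzer.get_document()      # Document carrying the emails, links, hashtags,
                                   # tokens and sentences extracted above
print(analyzer.get_emails(), analyzer.get_hash_tags(), analyzer.get_links())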