def predict(model, meanX, stdX):
    # Load test data. Modified from loadTrainData.
    header = ["\"id\"", "\"tags\""]
    outFile = open("../submit.csv", "w")
    outFile.write(",".join(header) + "\n")
    with open("../data/test.csv", "r") as inFile:
        csvReader = csv.reader(inFile, quotechar='"', delimiter=',',
                               quoting=csv.QUOTE_ALL, skipinitialspace=True)
        next(csvReader, None)  # skip header
        docs = [Document(l, "test") for l in csvReader]
    totalCount = util.calTotalCount(docs)
    for d in range(len(docs)):
        invVoc = {v: k for k, v in docs[d].vocab.items()}  # items(): Python 3 (was iteritems)
        docs[d].addTFIDF(totalCount, len(docs), True)
        fea = docs[d].getFeatures()
        fea, meanX, stdX = util.featureNorm(fea, meanX, stdX)
        # docs[d].debug()
        # labels = model.predict_classes(fea, batch_size=1, verbose=0)
        labels = model.predict_proba(fea, batch_size=1, verbose=0)
        posIdx = []
        posProb = []
        for ct1 in range(labels.shape[0]):
            if labels[ct1, 0] < labels[ct1, 1]:
                posIdx.append(ct1)
                posProb.append(labels[ct1, 1])
        sortIdx = np.argsort(posProb).tolist()
        realIdx = [posIdx[i] for i in sortIdx]
        tags = []
        realLen = len(realIdx)
        if realLen == 1:
            tags.append(invVoc[realIdx[-1]])
        elif realLen == 2:
            tags.append(invVoc[realIdx[-1]])
            tags.append(invVoc[realIdx[-2]])
        elif realLen > 2:
            tags.append(invVoc[realIdx[-1]])
            for ct1 in range(2):
                tmp = -2 - ct1
                if labels[realIdx[tmp]][1] > 0.8:
                    tags.append(invVoc[realIdx[tmp]])
        outFile.write("\"%d\"," % (docs[d].docId))
        outFile.write("\"" + " ".join(tags) + "\"\n")
    outFile.close()
def sign_and_store_document(rein, doc_type, document, signature_address=None, signature_key=None, store=True):
    """
    Save document if no signature key provided. Otherwise sign document, then validate and store it.
    """
    validated = False
    if signature_key is None:  # signing will happen outside app
        f = open(doc_type + '.txt', 'w')
        f.write(document)
        f.close()
        click.echo("\n%s\n" % document)
        done = False
        while not done:
            filename = click.prompt("File containing signed document", type=str,
                                    default=doc_type + '.sig.txt')
            if os.path.isfile(filename):
                done = True
        f = open(filename, 'r')
        signed = f.read()
        res = validate_enrollment(signed)
        if res:
            validated = True
    else:  # sign with stored delegate key
        signature = sign(signature_key, document)
        validated = verify(signature_address, document, signature)
        if validated:
            # insert signed document into documents table
            b = "-----BEGIN BITCOIN SIGNED MESSAGE-----"
            c = "-----BEGIN SIGNATURE-----"
            d = "-----END BITCOIN SIGNED MESSAGE-----"
            signed = "%s\n%s\n%s\n%s\n%s\n%s" % (b, document, c, signature_address, signature, d)
            click.echo('\n' + signed + '\n')
    if store:
        d = Document(rein, doc_type, signed, sig_verified=True, testnet=rein.testnet)
        rein.session.add(d)
        rein.session.commit()
        return d
    return validated
def parse_doc(self, doc_as_list):
    """
    This function takes a tweet document as a list and breaks it into different fields.
    :param doc_as_list: list representing the tweet.
    :return: Document object with corresponding fields.
    """
    tweet_id = doc_as_list[0]
    self.tweet_id = tweet_id
    tweet_date = doc_as_list[1]
    full_text = doc_as_list[2]
    url = doc_as_list[3]
    if type(url) is float:  # handle the bug where some urls are read as a 'nan' float
        url = "{}"
    url = url.replace("{", "").replace("}", "").replace('"', "").replace("[", "").replace("]", "")
    retweet_text = doc_as_list[4]
    retweet_url = doc_as_list[5]
    quote_text = doc_as_list[6]
    quote_url = doc_as_list[7]
    if url:
        # Find the start index of every occurrence of 'http'
        urls_index = [m.start() for m in re.finditer('http', url)]
        # Split the string into the individual urls
        urls = [url[:i - 1] if i - 1 > 0 else url[:i] for i in urls_index] + [url[urls_index[-1]:]]
        # Join all urls with spaces as a separator
        url = "".join(w + " " for w in urls)
        url_dict = self.parse_sentence(url)
    else:
        url_dict = {}
    full_text_dict = self.parse_sentence(full_text)
    # Merge both dicts into one via dictionary unpacking
    term_dict = {**full_text_dict, **url_dict}
    # doc_length = len(term_dict)  # after text operations.
    doc_length = sum(term_dict.values())  # after text operations
    # Guards against tweets that match no parsing rule, e.g. a full text of just 'same' (a stop word)
    document = Document(tweet_id, tweet_date, full_text, url, retweet_text,
                        retweet_url, quote_text, quote_url, term_dict, doc_length)
    return document
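# Illustrative sketch (not from the source): the {**a, **b} merge used in
# parse_doc above keeps the right-hand value when a key appears in both
# dicts, so shared counts are overwritten rather than summed. The sample
# dicts below are made up.
full_text_dict = {'covid': 2, 'mask': 1}
url_dict = {'covid': 1, 'who': 1}
term_dict = {**full_text_dict, **url_dict}
assert term_dict == {'covid': 1, 'mask': 1, 'who': 1}
assert sum(term_dict.values()) == 3  # the doc_length computed above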
def __init__(self, dir):
    """
    Constructs a new SearchEngine object given a directory name.
    """
    self._docs = dict()
    self._dir = dir
    self._num_docs = 0
    for file in os.listdir(dir):
        self._num_docs += 1
        doc = Document(dir + '/' + file)
        for word in doc.get_words():
            if word not in self._docs:
                self._docs[word] = []
            self._docs[word].append(dir + '/' + file)
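# Hedged usage sketch for the inverted index built above. Assumes a
# directory of plain-text files and that Document exposes get_words();
# the directory name and query word are placeholders, not from the source.
engine = SearchEngine('./docs')
matches = engine._docs.get('apple', [])  # file paths containing the word
print(len(matches), 'of', engine._num_docs, 'documents match')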
def generate_docs(self):
    if self.__size == 'B':
        numberDoc = random.randint(3, 5)
    elif self.__size == 'S':
        numberDoc = random.randint(1, 3)
    else:
        numberDoc = 0  # unknown size: generate nothing (avoids an unbound name below)
    for d in range(numberDoc):
        fmt = random.choice(['cheque', 'invoice', 'pdf'])  # renamed from `format` (shadows the builtin)
        if fmt == 'cheque':
            rect = pygame.Rect(self.__rect.x, self.__rect.y,
                               GLOBAL.CHEQUE_WIDTH, GLOBAL.CHEQUE_HEIGHT)
        else:
            rect = pygame.Rect(self.__rect.x, self.__rect.y,
                               GLOBAL.DOC_WIDTH, GLOBAL.DOC_HEIGHT)
        doc = Document(fmt, rect)
        self.__docs.append(doc)
def pipeline():
    """Build inverted index pipeline."""
    # Read docs.
    docs = util.get_docs()
    # Init connector.
    r_p, r_d, r_o = util.redis_init()
    # Build the index.
    for f in tqdm(docs):
        doc = Document(f, r_p, r_d)
        doc_terms = doc.terms
        doc.store(doc_terms)
        # Store each document's length, keyed by name.
        r_o.set(doc.name, len(doc_terms))
    # Store the global statistic: number of documents.
    r_o.set('num_docs', len(docs))
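# Hedged companion sketch: reading back the statistics pipeline() stores.
# Assumes util.redis_init() returns ordinary redis.Redis clients and that
# 'some_doc' is a document name previously written; both are assumptions.
_, _, r_o = util.redis_init()
num_docs = int(r_o.get('num_docs') or 0)
doc_len = int(r_o.get('some_doc') or 0)  # per-document length, keyed by name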
def load_clean_corpus(clean_path):
    clean_file = clean_path + 'clean.pkl'
    clean_text = pickle.load(open(clean_file, "rb"))
    clean_docs = list()
    for text in clean_text:
        doc = Document(text)
        clean_docs.append(doc)
    clean_corpus = Corpus(clean_docs)
    print('Loaded clean docs.')
    vector_file = clean_path + 'corpus_vectors.pkl'
    if os.path.exists(vector_file):
        vectors = pickle.load(open(vector_file, "rb"))
        clean_corpus.vectors = vectors
        print('Loaded corpus vectors.')
    return clean_corpus
def raster_test():
    doc = Document("Untitled-1")
    width = 100
    height = 100
    image = Image.new("L", (width, height))
    draw = ImageDraw.Draw(image)
    draw_circle(draw, width / 2, height / 2, width / 2 - 10, 255)
    draw_circle(draw, width * 3 / 4, height / 4, width / 5, 0)
    image.save("prntest.png")
    raster = Raster(image, 100, 100, 100, 100)
    doc.addRaster(raster)
    return doc
def test_018(self):
    """ Document document setter - valid PDF file with page directory """
    document = Document()
    document.dir = "tests"
    document.document = "tests/4page.pdf"
    self.assertEqual(document.name, "4page")
    self.assertEqual(len(document), 4)
    for i in range(1, 5):
        self.assertTrue(os.path.isfile("tests/4page" + str(i) + ".pdf"))
        self.assertTrue(os.path.isfile("tests/4page" + str(i) + ".txt"))
        self.assertTrue(os.path.isfile("tests/4page" + str(i) + ".json"))
    for i in range(1, 5):
        os.remove("tests/4page" + str(i) + ".pdf")
        os.remove("tests/4page" + str(i) + ".txt")
        os.remove("tests/4page" + str(i) + ".json")
def callbackContinue(self, response):
    result = response.result()
    if result is None:
        self.queue.get()
        return
    url = result.url
    soup = BeautifulSoup(result.content, "lxml")
    if result.status_code == 200:
        doc = Document(url, parseInfo(soup))
        self.documents.append(doc)
    # Both success and failure paths continue the crawl.
    self.findLinks(soup)
    self.queue.get()
def process_corpus(self, corpus):
    docs = []
    idf = []
    cord_uids = set()
    self.doc_ids = []
    self.word2id = dict()
    self.posting_list = dict()  # posting_list: key: string, value: set
    self.d_avg = 0
    word_index = 0
    doc_index = 0
    for i in range(len(corpus)):
        cord_uid = corpus["cord_uid"][i]
        if cord_uid in cord_uids:
            continue
        title = corpus["title"][i]
        title = "" if (not isinstance(title, str)) else title
        abstract = corpus["abstract"][i]
        abstract = "" if (not isinstance(abstract, str)) else abstract
        text = title + " " + abstract
        tokenized_text = self.process_text(text)
        # If the document is very short, skip it.
        if len(tokenized_text) < THRESHOLD_MIN_TOKEN:
            continue
        doc = Document(tokenized_text)
        self.d_avg += len(tokenized_text)
        for word in tokenized_text:
            # Add word to dictionary.
            if word not in self.word2id:
                self.word2id[word] = word_index
                idf.append(0)
                word_index += 1
            # Add doc to posting_list.
            if word not in self.posting_list:
                self.posting_list[word] = set()
            self.posting_list[word].add(doc_index)
        docs.append(doc)
        self.doc_ids.append(cord_uid)
        cord_uids.add(cord_uid)
        doc_index += 1
        for word in doc.tf_dict.keys():
            index = self.word2id[word]
            idf[index] += 1
        if i % 100 == 0:
            print("{} / {}; {:.2f} %".format(i, len(corpus), i / len(corpus) * 100))
    self.d_avg /= len(self.doc_ids)
    return idf, docs
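# Hedged follow-up sketch: the `idf` list returned above holds raw document
# frequencies (df), one per word id. Converting them to smoothed IDF weights
# is a plausible next step (assumed here, not shown in the source):
import math

def df_to_idf(df_counts, num_docs):
    # idf(w) = ln((N + 1) / (df + 1)) + 1, a standard smoothed variant
    return [math.log((num_docs + 1) / (df + 1)) + 1 for df in df_counts]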
def analysisHtml(self, f):
    document = Document()
    text = f.readline()
    while text:
        if text.startswith("<body"):
            document.documentWidth = int(self.analysisStyle(text)["width"][:-2])  # strip the trailing "px"
        if text.startswith("<title"):
            document.title = self.analysisText(text)
        if text.startswith("<h1") or text.startswith("<h2") or \
                text.startswith("<h3") or text.startswith("<h4"):
            block = document.addTextBlock()
            text_ = self.analysisText(text)  # text content
            block.addTextItem(text_, preTextItem=None)
            # Set heading level T1..T4 from the tag's digit.
            exec("block.setTitleLevel_(GlobalVars.T{})".format(text[2]))
        if text.startswith("<p"):
            block = document.addTextBlock()
        if text.startswith("<span"):
            attr = self.analysisStyle(text)  # style attributes
            text = self.analysisText(text)  # text content
            font = QFont()
            font.setFamily(attr["font-family"])
            font.setPointSize(int(attr["font-size"][0:-2]))
            textColor = attr["color"]
            textColor = textColor[5:-1].split(",")
            textColor = [int(textColor[i]) for i in range(3)] + [int(float(textColor[3]) * 255)]
            textColor = QColor(*textColor)
            backgroundColor = attr["background-color"]
            if backgroundColor == "none":
                backgroundColor = None
            else:
                backgroundColor = backgroundColor[5:-1].split(",")
                backgroundColor = [int(backgroundColor[i]) for i in range(3)] + [int(float(backgroundColor[3]) * 255)]
                backgroundColor = QColor(*backgroundColor)
            block.addTextItem(text, font=font, textColor=textColor, backgroundColor=backgroundColor)
        text = f.readline()
    return document
def show():
    query = 'Please input the Pdf!'
    try:
        file = request.form['path']
        # print("-------------file", file)
    except KeyError:  # no path supplied in the form
        file = None
        print("-------------", query)
        return query
    try:
        doc = Document()
        response = doc.identify_doc(file)
        temp = json.dumps(response)
    except Exception:
        temp = 'Try Again'
    return temp
def __init__(self, dir):
    """
    Constructs a new SearchEngine object with the given directory name.
    It contains the set of documents, the set of all words, the inverse
    index, and a dictionary mapping each word to its IDF.
    """
    self._dname = dir
    self._docs = set()
    for file in os.listdir(dir):
        doc = Document(dir + '/' + file)
        self._docs.add(doc)
    self._all_words = self._get_all_words()
    self._inverse_index = self._get_inverse_index()
    self._word_idf = dict()
    for word in self._all_words:
        self._word_idf[word] = self._calculate_idf(word)
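# Hedged sketch of what _calculate_idf plausibly computes; the real method
# is not shown in the source, so the ln(N / df) form below is an assumption.
import math

def _calculate_idf_sketch(num_docs, num_docs_containing_word):
    return math.log(num_docs / num_docs_containing_word)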
def __init__(self, directory):
    """
    Initializes a SearchEngine object with the given file directory as a parameter.
    """
    self._file_count = 0
    self._inverted_index = {}
    for file_name in os.listdir(directory):
        self._file_count += 1
        path = directory + '/' + file_name
        doc = Document(path)
        for word in doc.get_words():
            if word not in self._inverted_index.keys():
                self._inverted_index[word] = [doc]
            else:
                self._inverted_index[word].append(doc)
def search_documents(self):
    ws = self.app.config.get('paths', 'workspace')
    if not exists(ws):
        return
    docs = []
    for item in listdir(ws):
        fn = join(ws, item, 'project.json')
        if not exists(fn):
            continue
        doc = Document()
        doc.load(fn)
        docs.append((doc, fn))
    # Sort by modification time. The key= form replaces the original
    # Python 2-only cmp-based sort.
    docs.sort(key=lambda entry: entry[0].infos.time_modification)
    for doc, filename in docs:
        self.load_document(doc, filename)
def create_docs(self):
    """
    This function is called before launching the MapReduce workers.
    It creates a Document instance (with a unique id and the path to the
    corresponding file) for every file in the block.
    """
    new_doc_id = self.collection.doc_id_offset
    input_files_tuple = []
    for filename in self.input_files:
        new_doc_id += 1
        new_doc = Document(filename, new_doc_id)
        input_files_tuple.append((new_doc_id, filename))
        self.documents.append(new_doc)
    self.input_files = input_files_tuple
    self.collection.doc_id_offset = new_doc_id
def remove_common_words(self, n_words):
    """Remove n extra words based on raw count."""
    extra_stops = self.common_words(n_words)
    extra_stops = list(extra_stops.index)
    extra_stops.append('-PRON-')
    clean_docs = list()
    for doc in self.tokens:
        words = [w for w in doc if w not in extra_stops]
        text = ' '.join(words)
        document = Document(text)
        clean_docs.append(document)
    new_corpus = Corpus(clean_docs)
    return new_corpus, extra_stops
def setUp(self):
    self.document = Document(20)
    self.vocabulary = Vocabulary()
    self.vocabulary.load("../testdata/vocabulary.dat")
    self.model = Model(20)
    self.model.load('../testdata/lda_model')
    self.doc_tokens = [
        'macbook', 'ipad',       # exist in vocabulary and model
        'mac os x', 'chrome',    # only exist in vocabulary
        'nokia', 'null']         # nonexistent
def parse_doc(self, doc_as_list):
    """
    This function takes a tweet document as a list and breaks it into different fields.
    :param doc_as_list: list representing the tweet.
    :return: Document object with corresponding fields.
    """
    tweet_id = doc_as_list[0]
    tweet_date = doc_as_list[1]
    full_text = doc_as_list[2]
    url = doc_as_list[3]
    indices = doc_as_list[4]
    retweet_text = doc_as_list[5]
    retweet_url = doc_as_list[6]
    quote_text = doc_as_list[7]
    quote_url = doc_as_list[8]
    term_dict = {}
    if "http" in full_text:
        if url != "{}":
            split_url = url.split('"')
            self.url = self.url_Opretion(split_url[3])
            # self.text_operation(self.url)
            # if len(index) > 2:
            #     index_strart = int(index[0][2:])
            #     index_end = int(index[1][:-1])
            # else:
            #     index_strart = int(index[0][2:])
            #     index_end = int(index[1][:-2])
            # if index_strart == 117 and index_end == 140:  # problematic indexes
            #     pass
            # else:
            #     full_text = full_text[:index_strart] + split_url[3] + full_text[index_end:]
    full_text = full_text.replace(",", "")
    tokenized_text = self.tokenizer.tokenize(full_text)
    tokenized_text = self.text_operation(tokenized_text)
    tokenized_text = self.parse_sentence(tokenized_text)
    self.words_with_garbage = self.text_operation(self.words_with_garbage)
    tokenized_text.extend(self.url)
    self.url = []
    tokenized_text.extend(self.words_with_garbage)
    self.words_with_garbage = []
    doc_length = len(tokenized_text)  # after text operations
    uniq_max_freq = self.calc_uniq_max_freq(tokenized_text, term_dict)
    document = Document(tweet_id, tweet_date, full_text, url, retweet_text,
                        retweet_url, quote_text, quote_url, term_dict, doc_length,
                        uniq_max_freq[0], uniq_max_freq[1])
    return document
def parse_doc(self, doc_as_list):
    """
    This function takes a tweet document as a list and breaks it into different fields.
    :param doc_as_list: list representing the tweet.
    :return: Document object with corresponding fields.
    """
    tweet_id = doc_as_list[0]
    tweet_date = doc_as_list[1]
    full_text = doc_as_list[2]
    terms_list = self.parse_all_text(full_text)
    full_text = ' '.join(terms_list)
    url = doc_as_list[3]
    # url = self.parse_URL(url)
    # indices = doc_as_list[4]
    # retweet_text = doc_as_list[5]
    # retweet_text = self.parse_all_text(retweet_text, self.curr_idx)
    # retweet_url = doc_as_list[6]
    # retweet_url = self.parse_URL(url)
    # retweet_indices = doc_as_list[7]
    # quote_text = doc_as_list[8]
    # quote_url = doc_as_list[9]
    term_dict = {}
    # tokenized_text = self.parse_sentence(full_text)
    doc_length = len(terms_list)  # after text operations
    for term in terms_list:
        if self.steamer is not None and term.isalpha() \
                and '@' not in term and '#' not in term and 'http' not in term:
            term = self.steamer.stem_term(term)
            term = term.lower()
        if term not in term_dict.keys():
            term_dict[term] = 1
        else:
            term_dict[term] += 1
    # document = Document(tweet_id, tweet_date, full_text, url, term_dict, doc_length)
    self.doc_idx_tweet_id[self.curr_idx] = tweet_id
    # return [tweet_id, tweet_date, full_text, url, term_dict, doc_length]
    return Document(tweet_id, tweet_date, full_text, url, term_dict, doc_length)
def parse_document(cls, url, raw_text):
    words = TextParser.parse(raw_text)
    term_store = TermStoreFactory.get_store()
    doc = Document(url)
    doc_terms = []
    for index, word in enumerate(words):
        term_id = term_store.add_term(word)
        term = TermFactory.create(term_id, word, index)
        doc_terms.append(term)
    doc.terms = doc_terms
    doc_store = DocumentStoreFactory.get_store()
    doc_store.add_document(doc)
    return doc
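# Illustrative call (not from the source): assumes the enclosing class is
# named Parser and that TextParser and the stores are configured elsewhere;
# the URL and text are placeholders.
doc = Parser.parse_document('https://example.com', 'some raw page text')
print(len(doc.terms))  # one positional Term per parsed word, as built above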
def test_document_terms_field():
    '''
    Tests the construction of the terms field of a document.
    '''
    test = Document(FILE1)
    expected = {'i': 0.1, 'like': 0.1, 'apple': 0.2, 'pie': 0.2,
                'is': 0.1, 'super': 0.1, 'duper': 0.1, 'cool': 0.1}
    assert_equals(expected, test._terms)
def test_check_ContainsAutoExecutableMacro2_LogsError(self, popen_mock):
    olevba_mock = Mock()
    olevba_mock.configure_mock(**{'stdout.read.return_value': '| AutoExec | AutoOpen'})
    file_mock = Mock()
    file_mock.configure_mock(**{'stdout.read.return_value': 'application/vnd.ms-excel'})
    subprocess.Popen.side_effect = [file_mock, olevba_mock]
    self.logger.error = Mock()
    file_name = 'document_with_vba.doc'
    Document(file_name).initialize().check()
    assert isinstance(subprocess.Popen, Mock)
    self._assert_popen_call(self._setup_file_call(file_name))
    self._assert_popen_call(self._setup_olevba_call(file_name))
    self.logger.error.assert_called_once_with('VIRUS Contains macro(s) that execute automatically')
def search(self, query) -> List[SearchResult]:
    query_document = Document(query, query)
    self._prepare_document(query_document)
    self._calculate_document_bag_of_words(query_document)
    self._calculate_document_term_frequencies(query_document)
    self._calculate_document_inverse_term_frequencies(query_document)
    self._calculate_document_vector_length(query_document)
    results = []
    for document in self._documents:
        similarity = self._calculate_documents_similarity(document, query_document)
        results.append(SearchResult(similarity, document))
    results.sort(reverse=True)
    return results
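# Hedged usage sketch: querying the vector-space engine above. The engine
# variable and query string are placeholders; SearchResult is assumed to
# order by its similarity field, matching results.sort(reverse=True) above.
results = engine.search('apple pie recipe')
for result in results[:5]:
    print(result)  # most similar documents first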
def test_get_annotated(self):
    text = 'Lorem ipsum dolor sit amet. Consectetur adipiscing elit. Sed do eiusmod tempor incididunt.'
    tagged_text = ('<p><span>Lorem</span> ipsum dolor sit <span>amet</span>.</p>'
                   '<p>Consectetur adipiscing <span>elit</span>.</p>'
                   '<p>Sed do eiusmod tempor <span>incididunt</span>.</p>')
    lst_annotations = [
        Annotation('sentence', 0, 27),
        Annotation('sentence', 28, 56),
        Annotation('sentence', 57, 90),
        Annotation('word', 22, 26),
        Annotation('word', 51, 55),
        Annotation('word', 79, 89),
        Annotation('word', 0, 5)
    ]
    tagged_text_test = get_annotated(Document(text, 'test', lst_annotations))
    self.assertEqual(tagged_text_test, tagged_text)
def parse_doc(self, doc_as_list):
    """
    This function takes a tweet document as a list and breaks it into different fields.
    :param doc_as_list: list representing the tweet.
    :return: Document object with corresponding fields.
    """
    tweet_id = doc_as_list[0]
    tweet_date = doc_as_list[1]
    full_text = doc_as_list[2]
    url = doc_as_list[3]
    quote_text = doc_as_list[8]
    quote_url = doc_as_list[9]
    term_dict = {}
    try:
        tokenized_text = self.parse_sentence(full_text)
    except Exception:
        print(full_text)
        tokenized_text = []
    if self.include_urls:
        tokenized_text += self._parse_urls(url)
    if self.include_quote and quote_text is not None:
        tokenized_text += self.parse_sentence(quote_text)
    if self.include_quote and self.include_urls and quote_url is not None:
        tokenized_text += self._parse_urls(quote_url)
    doc_length = len(tokenized_text)  # after text operations
    for term in tokenized_text:
        if term not in term_dict.keys():
            term_dict[term] = 1
        else:
            term_dict[term] += 1
    document = Document(tweet_id, tweet_date, full_text, url,
                        retweet_text=None, retweet_url=None,
                        quote_text=quote_text, quote_url=quote_url,
                        term_doc_dictionary=term_dict, doc_length=doc_length)
    return document
def __init__(self, dir='', filenames=None, extensions=DEFAULT_ALLOWED_EXTENSIONS,
             stopwords=None, puncts=None):
    """
    :param dir: directory containing .txt files used as the corpus
    :param filenames: ordered list of file paths, used together with `dir`
    :param extensions: legal extensions of documents, used to automatically fetch documents
    :param stopwords: list of stopwords
    :param puncts: list of punctuation marks
    """
    if filenames is None:
        # Auto-detect document files.
        if not (isinstance(dir, str) and os.path.isdir(dir)):
            raise IOError("{} is not a directory".format(dir))
        filenames = [f for f in os.listdir(dir)
                     if "." in f and f.rsplit(".")[-1] in extensions]
    self.documents = [Document(os.path.join(dir, f), stopwords=stopwords, puncts=puncts)
                      for f in filenames]
    self.tokens = [doc.tokens for doc in self.documents]
    self.spaced = [doc.spaced for doc in self.documents]
    self.dictionary = Dictionary(self.tokens)
    self.ids = ...
    self.bow = ...
    self.tfidf_model = ...
    self.tfidf_score = ...
    self.tfidf_keywords = ...
    self.spans = ...
    self.tfidf_span_score = ...
    self.tfidf_span_keywords = ...
    self.textrank_keywords = ...
    # Set up the model.
    self._set_ids()
    self._set_bow()
    self._set_tfidf_model()
    self._set_tfidf_score()
    self._set_spans()
    self._set_tfidf_span_score()
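# Illustrative construction (not from the source): assumes a folder of .txt
# files at a placeholder path. __init__ runs the TF-IDF setup, so the score
# fields are populated afterwards.
corpus = Corpus(dir='./articles')
print(len(corpus.documents), 'documents loaded')
print(corpus.tfidf_score)  # per-document TF-IDF scores set by _set_tfidf_score()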
def creationDicoDocs(self):
    dicoDocs = dict()
    fichier = open(self._collection, "r")
    for line in fichier:
        if line.startswith(".I"):
            docu = Document()
            num_doc = int(line.split(" ")[1].strip())
            docu.setI(num_doc)
            dicoDocs[num_doc] = docu
        elif line.startswith(".T") or line.startswith(".K") or line.startswith(".B") \
                or line.startswith(".W") or line.startswith(".N") \
                or line.startswith(".X") or line.startswith(".A"):
            balise = line.split(" ")[0].strip()  # the current field marker ("balise" = tag)
        elif not line.startswith("\n"):
            docu.setContenu(balise, line)
    fichier.close()
    return dicoDocs
def doAddDocument(self):
    if self.dlgDocument is None:
        self.dlgDocument = Document(self.iface)
    self.dlgDocument.fillTree()
    self.dlgDocument.selectedGuid = None
    self.dlgDocument.exec_()
    if self.dlgDocument.selectedGuid is not None:
        listNames = ['id_kn', 'guid_document']
        listValues = [[self.id_kn, self.dlgDocument.guidDocument]]
        if insertFeatures('pb_kn_document', listNames, listValues):
            self.dlgFill()
        else:
            # Russian strings: "Error" / "An error occurred while adding the document"
            QMessageBox.warning(self.iface.mainWindow(), u'Ошибка',
                                u'Произошла ошибка добавления документа')