def predict(model, meanX, stdX):
    # Load test data. Modified from loadTrainData.
    header = ["\"id\"", "\"tags\""]
    outFile = open("../submit.csv", "w")
    outFile.write(",".join(header) + "\n")
    with open("../data/test.csv", "r") as inFile:
        csvReader = csv.reader(inFile, quotechar='"', delimiter=',',
                               quoting=csv.QUOTE_ALL, skipinitialspace=True)
        next(csvReader, None)  # skip header
        docs = [Document(l, "test") for l in csvReader]
    totalCount = util.calTotalCount(docs)
    for d in range(len(docs)):
        invVoc = {v: k for k, v in docs[d].vocab.items()}  # items(): Python 3 (was iteritems)
        docs[d].addTFIDF(totalCount, len(docs), True)
        fea = docs[d].getFeatures()
        fea, meanX, stdX = util.featureNorm(fea, meanX, stdX)
        # docs[d].debug()
        # labels = model.predict_classes(fea, batch_size=1, verbose=0)
        labels = model.predict_proba(fea, batch_size=1, verbose=0)
        posIdx = []
        posProb = []
        for ct1 in range(labels.shape[0]):
            if labels[ct1, 0] < labels[ct1, 1]:
                posIdx.append(ct1)
                posProb.append(labels[ct1, 1])
        sortIdx = np.argsort(posProb).tolist()
        realIdx = [posIdx[i] for i in sortIdx]
        tags = []
        realLen = len(realIdx)
        if realLen == 1:
            tags.append(invVoc[realIdx[-1]])
        elif realLen == 2:
            tags.append(invVoc[realIdx[-1]])
            tags.append(invVoc[realIdx[-2]])
        elif realLen > 2:
            tags.append(invVoc[realIdx[-1]])
            for ct1 in range(2):
                tmp = -2 - ct1
                if labels[realIdx[tmp]][1] > 0.8:
                    tags.append(invVoc[realIdx[tmp]])
        outFile.write("\"%d\"," % (docs[d].docId))
        outFile.write("\"" + " ".join(tags) + "\"\n")
    outFile.close()
def sign_and_store_document(rein, doc_type, document, signature_address=None, signature_key=None, store=True):
    """
    Save document if no signature key provided. Otherwise sign document, then validate and store it.
    """
    validated = False
    if signature_key is None:  # signing will happen outside app
        f = open(doc_type + '.txt', 'w')
        f.write(document)
        f.close()
        click.echo("\n%s\n" % document)
        done = False
        while not done:
            filename = click.prompt("File containing signed document", type=str,
                                    default=doc_type + '.sig.txt')
            if os.path.isfile(filename):
                done = True
        f = open(filename, 'r')
        signed = f.read()
        res = validate_enrollment(signed)
        if res:
            validated = True
    else:  # sign with stored delegate key
        signature = sign(signature_key, document)
        validated = verify(signature_address, document, signature)
        if validated:
            # insert signed document into documents table
            b = "-----BEGIN BITCOIN SIGNED MESSAGE-----"
            c = "-----BEGIN SIGNATURE-----"
            d = "-----END BITCOIN SIGNED MESSAGE-----"
            signed = "%s\n%s\n%s\n%s\n%s\n%s" % (b, document, c, signature_address, signature, d)
            click.echo('\n' + signed + '\n')
    if store:
        d = Document(rein, doc_type, signed, sig_verified=True, testnet=rein.testnet)
        rein.session.add(d)
        rein.session.commit()
        return d
    return validated
def parse_doc(self, doc_as_list):
    """
    This function takes a tweet document as a list and breaks it into different fields.
    :param doc_as_list: list representing the tweet.
    :return: Document object with corresponding fields.
    """
    tweet_id = doc_as_list[0]
    self.tweet_id = tweet_id
    tweet_date = doc_as_list[1]
    full_text = doc_as_list[2]
    url = doc_as_list[3]
    if type(url) is float:  # handle the bug where some urls are read as a 'nan' float
        url = "{}"
    url = url.replace("{", "").replace("}", "").replace('"', "").replace("[", "").replace("]", "")
    retweet_text = doc_as_list[4]
    retweet_url = doc_as_list[5]
    quote_text = doc_as_list[6]
    quote_url = doc_as_list[7]
    if url:
        # Find the start index of every occurrence of 'http'
        urls_index = [m.start() for m in re.finditer('http', url)]
        # Split the string into the individual urls
        urls = [url[:i - 1] if i - 1 > 0 else url[:i] for i in urls_index] + [url[urls_index[-1]:]]
        # Join all urls with spaces as a separator
        url = "".join(w + " " for w in urls)
        url_dict = self.parse_sentence(url)
    else:
        url_dict = {}
    full_text_dict = self.parse_sentence(full_text)
    # Merge both dicts into one via dictionary unpacking
    term_dict = {**full_text_dict, **url_dict}
    # doc_length = len(term_dict)  # after text operations.
    doc_length = sum(term_dict.values())  # after text operations
    # Guards against tweets that match no parsing rule, e.g. a full text of just 'same' (a stop word)
    document = Document(tweet_id, tweet_date, full_text, url, retweet_text,
                        retweet_url, quote_text, quote_url, term_dict, doc_length)
    return document
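# Illustrative sketch (not from the source): the {**a, **b} merge used in
# parse_doc above keeps the right-hand value when a key appears in both
# dicts, so shared counts are overwritten rather than summed. The sample
# dicts below are made up.
full_text_dict = {'covid': 2, 'mask': 1}
url_dict = {'covid': 1, 'who': 1}
term_dict = {**full_text_dict, **url_dict}
assert term_dict == {'covid': 1, 'mask': 1, 'who': 1}
assert sum(term_dict.values()) == 3  # the doc_length computed above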
def __init__(self, dir):
    """
    Constructs a new SearchEngine object given a directory name.
    """
    self._docs = dict()
    self._dir = dir
    self._num_docs = 0
    for file in os.listdir(dir):
        self._num_docs += 1
        doc = Document(dir + '/' + file)
        for word in doc.get_words():
            if word not in self._docs:
                self._docs[word] = []
            self._docs[word].append(dir + '/' + file)
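# Hedged usage sketch for the inverted index built above. Assumes a
# directory of plain-text files and that Document exposes get_words();
# the directory name and query word are placeholders, not from the source.
engine = SearchEngine('./docs')
matches = engine._docs.get('apple', [])  # file paths containing the word
print(len(matches), 'of', engine._num_docs, 'documents match')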
def generate_docs(self):
    if self.__size == 'B':
        numberDoc = random.randint(3, 5)
    elif self.__size == 'S':
        numberDoc = random.randint(1, 3)
    else:
        numberDoc = 0  # unknown size: generate nothing (avoids an unbound name below)
    for d in range(numberDoc):
        fmt = random.choice(['cheque', 'invoice', 'pdf'])  # renamed from `format` (shadows the builtin)
        if fmt == 'cheque':
            rect = pygame.Rect(self.__rect.x, self.__rect.y,
                               GLOBAL.CHEQUE_WIDTH, GLOBAL.CHEQUE_HEIGHT)
        else:
            rect = pygame.Rect(self.__rect.x, self.__rect.y,
                               GLOBAL.DOC_WIDTH, GLOBAL.DOC_HEIGHT)
        doc = Document(fmt, rect)
        self.__docs.append(doc)
def pipeline():
    """Build inverted index pipeline."""
    # Read docs.
    docs = util.get_docs()
    # Init connector.
    r_p, r_d, r_o = util.redis_init()
    # Build the index.
    for f in tqdm(docs):
        doc = Document(f, r_p, r_d)
        doc_terms = doc.terms
        doc.store(doc_terms)
        # Store each document's length, keyed by name.
        r_o.set(doc.name, len(doc_terms))
    # Store the global statistic: number of documents.
    r_o.set('num_docs', len(docs))
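# Hedged companion sketch: reading back the statistics pipeline() stores.
# Assumes util.redis_init() returns ordinary redis.Redis clients and that
# 'some_doc' is a document name previously written; both are assumptions.
_, _, r_o = util.redis_init()
num_docs = int(r_o.get('num_docs') or 0)
doc_len = int(r_o.get('some_doc') or 0)  # per-document length, keyed by name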
def load_clean_corpus(clean_path):
    clean_file = clean_path + 'clean.pkl'
    clean_text = pickle.load(open(clean_file, "rb"))
    clean_docs = list()
    for text in clean_text:
        doc = Document(text)
        clean_docs.append(doc)
    clean_corpus = Corpus(clean_docs)
    print('Loaded clean docs.')
    vector_file = clean_path + 'corpus_vectors.pkl'
    if os.path.exists(vector_file):
        vectors = pickle.load(open(vector_file, "rb"))
        clean_corpus.vectors = vectors
        print('Loaded corpus vectors.')
    return clean_corpus
def raster_test():
    doc = Document("Untitled-1")
    width = 100
    height = 100
    image = Image.new("L", (width, height))
    draw = ImageDraw.Draw(image)
    draw_circle(draw, width / 2, height / 2, width / 2 - 10, 255)
    draw_circle(draw, width * 3 / 4, height / 4, width / 5, 0)
    image.save("prntest.png")
    raster = Raster(image, 100, 100, 100, 100)
    doc.addRaster(raster)
    return doc
def test_018(self):
    """ Document document setter - valid PDF file with page directory """
    document = Document()
    document.dir = "tests"
    document.document = "tests/4page.pdf"
    self.assertEqual(document.name, "4page")
    self.assertEqual(len(document), 4)
    for i in range(1, 5):
        self.assertTrue(os.path.isfile("tests/4page" + str(i) + ".pdf"))
        self.assertTrue(os.path.isfile("tests/4page" + str(i) + ".txt"))
        self.assertTrue(os.path.isfile("tests/4page" + str(i) + ".json"))
    for i in range(1, 5):
        os.remove("tests/4page" + str(i) + ".pdf")
        os.remove("tests/4page" + str(i) + ".txt")
        os.remove("tests/4page" + str(i) + ".json")
def callbackContinue(self, response):
    result = response.result()
    if result is None:
        self.queue.get()
        return
    url = result.url
    soup = BeautifulSoup(result.content, "lxml")
    if result.status_code == 200:
        doc = Document(url, parseInfo(soup))
        self.documents.append(doc)
    # Both success and failure paths continue the crawl.
    self.findLinks(soup)
    self.queue.get()
def process_corpus(self, corpus):
    docs = []
    idf = []
    cord_uids = set()
    self.doc_ids = []
    self.word2id = dict()
    self.posting_list = dict()  # posting_list: key: string, value: set
    self.d_avg = 0
    word_index = 0
    doc_index = 0
    for i in range(len(corpus)):
        cord_uid = corpus["cord_uid"][i]
        if cord_uid in cord_uids:
            continue
        title = corpus["title"][i]
        title = "" if (not isinstance(title, str)) else title
        abstract = corpus["abstract"][i]
        abstract = "" if (not isinstance(abstract, str)) else abstract
        text = title + " " + abstract
        tokenized_text = self.process_text(text)
        # If the document is very short, skip it.
        if len(tokenized_text) < THRESHOLD_MIN_TOKEN:
            continue
        doc = Document(tokenized_text)
        self.d_avg += len(tokenized_text)
        for word in tokenized_text:
            # Add word to dictionary.
            if word not in self.word2id:
                self.word2id[word] = word_index
                idf.append(0)
                word_index += 1
            # Add doc to posting_list.
            if word not in self.posting_list:
                self.posting_list[word] = set()
            self.posting_list[word].add(doc_index)
        docs.append(doc)
        self.doc_ids.append(cord_uid)
        cord_uids.add(cord_uid)
        doc_index += 1
        for word in doc.tf_dict.keys():
            index = self.word2id[word]
            idf[index] += 1
        if i % 100 == 0:
            print("{} / {}; {:.2f} %".format(i, len(corpus), i / len(corpus) * 100))
    self.d_avg /= len(self.doc_ids)
    return idf, docs
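# Hedged follow-up sketch: the `idf` list returned above holds raw document
# frequencies (df), one per word id. Converting them to smoothed IDF weights
# is a plausible next step (assumed here, not shown in the source):
import math

def df_to_idf(df_counts, num_docs):
    # idf(w) = ln((N + 1) / (df + 1)) + 1, a standard smoothed variant
    return [math.log((num_docs + 1) / (df + 1)) + 1 for df in df_counts]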
def analysisHtml(self, f):
    document = Document()
    text = f.readline()
    while text:
        if text.startswith("<body"):
            document.documentWidth = int(self.analysisStyle(text)["width"][:-2])  # strip the trailing "px"
        if text.startswith("<title"):
            document.title = self.analysisText(text)
        if text.startswith("<h1") or text.startswith("<h2") or \
                text.startswith("<h3") or text.startswith("<h4"):
            block = document.addTextBlock()
            text_ = self.analysisText(text)  # text content
            block.addTextItem(text_, preTextItem=None)
            # Set heading level T1..T4 from the tag's digit.
            exec("block.setTitleLevel_(GlobalVars.T{})".format(text[2]))
        if text.startswith("<p"):
            block = document.addTextBlock()
        if text.startswith("<span"):
            attr = self.analysisStyle(text)  # style attributes
            text = self.analysisText(text)  # text content
            font = QFont()
            font.setFamily(attr["font-family"])
            font.setPointSize(int(attr["font-size"][0:-2]))
            textColor = attr["color"]
            textColor = textColor[5:-1].split(",")
            textColor = [int(textColor[i]) for i in range(3)] + [int(float(textColor[3]) * 255)]
            textColor = QColor(*textColor)
            backgroundColor = attr["background-color"]
            if backgroundColor == "none":
                backgroundColor = None
            else:
                backgroundColor = backgroundColor[5:-1].split(",")
                backgroundColor = [int(backgroundColor[i]) for i in range(3)] + [int(float(backgroundColor[3]) * 255)]
                backgroundColor = QColor(*backgroundColor)
            block.addTextItem(text, font=font, textColor=textColor, backgroundColor=backgroundColor)
        text = f.readline()
    return document
def show():
    query = 'Please input the Pdf!'
    try:
        file = request.form['path']
        # print("-------------file", file)
    except KeyError:  # no path supplied in the form
        file = None
        print("-------------", query)
        return query
    try:
        doc = Document()
        response = doc.identify_doc(file)
        temp = json.dumps(response)
    except Exception:
        temp = 'Try Again'
    return temp
def __init__(self, dir):
    """
    Constructs a new SearchEngine object with the given directory name.
    It contains the set of documents, the set of all words, the inverse
    index, and a dictionary mapping each word to its IDF.
    """
    self._dname = dir
    self._docs = set()
    for file in os.listdir(dir):
        doc = Document(dir + '/' + file)
        self._docs.add(doc)
    self._all_words = self._get_all_words()
    self._inverse_index = self._get_inverse_index()
    self._word_idf = dict()
    for word in self._all_words:
        self._word_idf[word] = self._calculate_idf(word)
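# Hedged sketch of what _calculate_idf plausibly computes; the real method
# is not shown in the source, so the ln(N / df) form below is an assumption.
import math

def _calculate_idf_sketch(num_docs, num_docs_containing_word):
    return math.log(num_docs / num_docs_containing_word)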
def __init__(self, directory):
    """
    Initializes a SearchEngine object with the given file directory as a parameter.
    """
    self._file_count = 0
    self._inverted_index = {}
    for file_name in os.listdir(directory):
        self._file_count += 1
        path = directory + '/' + file_name
        doc = Document(path)
        for word in doc.get_words():
            if word not in self._inverted_index.keys():
                self._inverted_index[word] = [doc]
            else:
                self._inverted_index[word].append(doc)
def search_documents(self):
    ws = self.app.config.get('paths', 'workspace')
    if not exists(ws):
        return
    docs = []
    for item in listdir(ws):
        fn = join(ws, item, 'project.json')
        if not exists(fn):
            continue
        doc = Document()
        doc.load(fn)
        docs.append((doc, fn))
    # Sort by modification time. The key= form replaces the original
    # Python 2-only cmp-based sort.
    docs.sort(key=lambda entry: entry[0].infos.time_modification)
    for doc, filename in docs:
        self.load_document(doc, filename)
def create_docs(self):
    """
    This function is called before launching the MapReduce workers.
    It creates a Document instance (with a unique id and the path to the
    corresponding file) for every file in the block.
    """
    new_doc_id = self.collection.doc_id_offset
    input_files_tuple = []
    for filename in self.input_files:
        new_doc_id += 1
        new_doc = Document(filename, new_doc_id)
        input_files_tuple.append((new_doc_id, filename))
        self.documents.append(new_doc)
    self.input_files = input_files_tuple
    self.collection.doc_id_offset = new_doc_id
def remove_common_words(self, n_words):
    """Remove n extra words based on raw count."""
    extra_stops = self.common_words(n_words)
    extra_stops = list(extra_stops.index)
    extra_stops.append('-PRON-')
    clean_docs = list()
    for doc in self.tokens:
        words = [w for w in doc if w not in extra_stops]
        text = ' '.join(words)
        document = Document(text)
        clean_docs.append(document)
    new_corpus = Corpus(clean_docs)
    return new_corpus, extra_stops
def setUp(self):
    self.document = Document(20)
    self.vocabulary = Vocabulary()
    self.vocabulary.load("../testdata/vocabulary.dat")
    self.model = Model(20)
    self.model.load('../testdata/lda_model')
    self.doc_tokens = [
        'macbook', 'ipad',       # exist in vocabulary and model
        'mac os x', 'chrome',    # only exist in vocabulary
        'nokia', 'null']         # nonexistent
def parse_doc(self, doc_as_list):
    """
    This function takes a tweet document as a list and breaks it into different fields.
    :param doc_as_list: list representing the tweet.
    :return: Document object with corresponding fields.
    """
    tweet_id = doc_as_list[0]
    tweet_date = doc_as_list[1]
    full_text = doc_as_list[2]
    url = doc_as_list[3]
    indices = doc_as_list[4]
    retweet_text = doc_as_list[5]
    retweet_url = doc_as_list[6]
    quote_text = doc_as_list[7]
    quote_url = doc_as_list[8]
    term_dict = {}
    if "http" in full_text:
        if url != "{}":
            split_url = url.split('"')
            self.url = self.url_Opretion(split_url[3])
            # self.text_operation(self.url)
            # if len(index) > 2:
            #     index_strart = int(index[0][2:])
            #     index_end = int(index[1][:-1])
            # else:
            #     index_strart = int(index[0][2:])
            #     index_end = int(index[1][:-2])
            # if index_strart == 117 and index_end == 140:  # problematic indexes
            #     pass
            # else:
            #     full_text = full_text[:index_strart] + split_url[3] + full_text[index_end:]
    full_text = full_text.replace(",", "")
    tokenized_text = self.tokenizer.tokenize(full_text)
    tokenized_text = self.text_operation(tokenized_text)
    tokenized_text = self.parse_sentence(tokenized_text)
    self.words_with_garbage = self.text_operation(self.words_with_garbage)
    tokenized_text.extend(self.url)
    self.url = []
    tokenized_text.extend(self.words_with_garbage)
    self.words_with_garbage = []
    doc_length = len(tokenized_text)  # after text operations
    uniq_max_freq = self.calc_uniq_max_freq(tokenized_text, term_dict)
    document = Document(tweet_id, tweet_date, full_text, url, retweet_text,
                        retweet_url, quote_text, quote_url, term_dict, doc_length,
                        uniq_max_freq[0], uniq_max_freq[1])
    return document
def parse_doc(self, doc_as_list):
    """
    This function takes a tweet document as a list and breaks it into different fields.
    :param doc_as_list: list representing the tweet.
    :return: Document object with corresponding fields.
    """
    tweet_id = doc_as_list[0]
    tweet_date = doc_as_list[1]
    full_text = doc_as_list[2]
    terms_list = self.parse_all_text(full_text)
    full_text = ' '.join(terms_list)
    url = doc_as_list[3]
    # url = self.parse_URL(url)
    # indices = doc_as_list[4]
    # retweet_text = doc_as_list[5]
    # retweet_text = self.parse_all_text(retweet_text, self.curr_idx)
    # retweet_url = doc_as_list[6]
    # retweet_url = self.parse_URL(url)
    # retweet_indices = doc_as_list[7]
    # quote_text = doc_as_list[8]
    # quote_url = doc_as_list[9]
    term_dict = {}
    # tokenized_text = self.parse_sentence(full_text)
    doc_length = len(terms_list)  # after text operations
    for term in terms_list:
        if self.steamer is not None and term.isalpha() \
                and '@' not in term and '#' not in term and 'http' not in term:
            term = self.steamer.stem_term(term)
            term = term.lower()
        if term not in term_dict.keys():
            term_dict[term] = 1
        else:
            term_dict[term] += 1
    # document = Document(tweet_id, tweet_date, full_text, url, term_dict, doc_length)
    self.doc_idx_tweet_id[self.curr_idx] = tweet_id
    # return [tweet_id, tweet_date, full_text, url, term_dict, doc_length]
    return Document(tweet_id, tweet_date, full_text, url, term_dict, doc_length)
def parse_document(cls, url, raw_text):
    words = TextParser.parse(raw_text)
    term_store = TermStoreFactory.get_store()
    doc = Document(url)
    doc_terms = []
    for index, word in enumerate(words):
        term_id = term_store.add_term(word)
        term = TermFactory.create(term_id, word, index)
        doc_terms.append(term)
    doc.terms = doc_terms
    doc_store = DocumentStoreFactory.get_store()
    doc_store.add_document(doc)
    return doc
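# Illustrative call (not from the source): assumes the enclosing class is
# named Parser and that TextParser and the stores are configured elsewhere;
# the URL and text are placeholders.
doc = Parser.parse_document('https://example.com', 'some raw page text')
print(len(doc.terms))  # one positional Term per parsed word, as built above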
def test_document_terms_field():
    '''
    Tests the construction of the terms field of a document.
    '''
    test = Document(FILE1)
    expected = {'i': 0.1, 'like': 0.1, 'apple': 0.2, 'pie': 0.2,
                'is': 0.1, 'super': 0.1, 'duper': 0.1, 'cool': 0.1}
    assert_equals(expected, test._terms)
def test_check_ContainsAutoExecutableMacro2_LogsError(self, popen_mock):
    olevba_mock = Mock()
    olevba_mock.configure_mock(**{'stdout.read.return_value': '| AutoExec | AutoOpen'})
    file_mock = Mock()
    file_mock.configure_mock(**{'stdout.read.return_value': 'application/vnd.ms-excel'})
    subprocess.Popen.side_effect = [file_mock, olevba_mock]
    self.logger.error = Mock()
    file_name = 'document_with_vba.doc'
    Document(file_name).initialize().check()
    assert isinstance(subprocess.Popen, Mock)
    self._assert_popen_call(self._setup_file_call(file_name))
    self._assert_popen_call(self._setup_olevba_call(file_name))
    self.logger.error.assert_called_once_with('VIRUS Contains macro(s) that execute automatically')
def search(self, query) -> List[SearchResult]:
    query_document = Document(query, query)
    self._prepare_document(query_document)
    self._calculate_document_bag_of_words(query_document)
    self._calculate_document_term_frequencies(query_document)
    self._calculate_document_inverse_term_frequencies(query_document)
    self._calculate_document_vector_length(query_document)
    results = []
    for document in self._documents:
        similarity = self._calculate_documents_similarity(document, query_document)
        results.append(SearchResult(similarity, document))
    results.sort(reverse=True)
    return results
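# Hedged usage sketch: querying the vector-space engine above. The engine
# variable and query string are placeholders; SearchResult is assumed to
# order by its similarity field, matching results.sort(reverse=True) above.
results = engine.search('apple pie recipe')
for result in results[:5]:
    print(result)  # most similar documents first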
def test_get_annotated(self):
    text = 'Lorem ipsum dolor sit amet. Consectetur adipiscing elit. Sed do eiusmod tempor incididunt.'
    tagged_text = ('<p><span>Lorem</span> ipsum dolor sit <span>amet</span>.</p>'
                   '<p>Consectetur adipiscing <span>elit</span>.</p>'
                   '<p>Sed do eiusmod tempor <span>incididunt</span>.</p>')
    lst_annotations = [
        Annotation('sentence', 0, 27),
        Annotation('sentence', 28, 56),
        Annotation('sentence', 57, 90),
        Annotation('word', 22, 26),
        Annotation('word', 51, 55),
        Annotation('word', 79, 89),
        Annotation('word', 0, 5)
    ]
    tagged_text_test = get_annotated(Document(text, 'test', lst_annotations))
    self.assertEqual(tagged_text_test, tagged_text)
def parse_doc(self, doc_as_list):
    """
    This function takes a tweet document as a list and breaks it into different fields.
    :param doc_as_list: list representing the tweet.
    :return: Document object with corresponding fields.
    """
    tweet_id = doc_as_list[0]
    tweet_date = doc_as_list[1]
    full_text = doc_as_list[2]
    url = doc_as_list[3]
    quote_text = doc_as_list[8]
    quote_url = doc_as_list[9]
    term_dict = {}
    try:
        tokenized_text = self.parse_sentence(full_text)
    except Exception:
        print(full_text)
        tokenized_text = []
    if self.include_urls:
        tokenized_text += self._parse_urls(url)
    if self.include_quote and quote_text is not None:
        tokenized_text += self.parse_sentence(quote_text)
    if self.include_quote and self.include_urls and quote_url is not None:
        tokenized_text += self._parse_urls(quote_url)
    doc_length = len(tokenized_text)  # after text operations
    for term in tokenized_text:
        if term not in term_dict.keys():
            term_dict[term] = 1
        else:
            term_dict[term] += 1
    document = Document(tweet_id, tweet_date, full_text, url,
                        retweet_text=None, retweet_url=None,
                        quote_text=quote_text, quote_url=quote_url,
                        term_doc_dictionary=term_dict, doc_length=doc_length)
    return document
def __init__(self, dir='', filenames=None, extensions=DEFAULT_ALLOWED_EXTENSIONS,
             stopwords=None, puncts=None):
    """
    :param dir: directory containing .txt files used as the corpus
    :param filenames: ordered list of file paths, used together with `dir`
    :param extensions: legal extensions of documents, used to automatically fetch documents
    :param stopwords: list of stopwords
    :param puncts: list of punctuation marks
    """
    if filenames is None:
        # Auto-detect document files.
        if not (isinstance(dir, str) and os.path.isdir(dir)):
            raise IOError("{} is not a directory".format(dir))
        filenames = [f for f in os.listdir(dir)
                     if "." in f and f.rsplit(".")[-1] in extensions]
    self.documents = [Document(os.path.join(dir, f), stopwords=stopwords, puncts=puncts)
                      for f in filenames]
    self.tokens = [doc.tokens for doc in self.documents]
    self.spaced = [doc.spaced for doc in self.documents]
    self.dictionary = Dictionary(self.tokens)
    self.ids = ...
    self.bow = ...
    self.tfidf_model = ...
    self.tfidf_score = ...
    self.tfidf_keywords = ...
    self.spans = ...
    self.tfidf_span_score = ...
    self.tfidf_span_keywords = ...
    self.textrank_keywords = ...
    # Set up the model.
    self._set_ids()
    self._set_bow()
    self._set_tfidf_model()
    self._set_tfidf_score()
    self._set_spans()
    self._set_tfidf_span_score()
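# Illustrative construction (not from the source): assumes a folder of .txt
# files at a placeholder path. __init__ runs the TF-IDF setup, so the score
# fields are populated afterwards.
corpus = Corpus(dir='./articles')
print(len(corpus.documents), 'documents loaded')
print(corpus.tfidf_score)  # per-document TF-IDF scores set by _set_tfidf_score()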
def creationDicoDocs(self):
    dicoDocs = dict()
    fichier = open(self._collection, "r")
    for line in fichier:
        if line.startswith(".I"):
            docu = Document()
            num_doc = int(line.split(" ")[1].strip())
            docu.setI(num_doc)
            dicoDocs[num_doc] = docu
        elif line.startswith(".T") or line.startswith(".K") or line.startswith(".B") \
                or line.startswith(".W") or line.startswith(".N") \
                or line.startswith(".X") or line.startswith(".A"):
            balise = line.split(" ")[0].strip()  # the current field marker ("balise" = tag)
        elif not line.startswith("\n"):
            docu.setContenu(balise, line)
    fichier.close()
    return dicoDocs
def doAddDocument(self):
    if self.dlgDocument is None:
        self.dlgDocument = Document(self.iface)
    self.dlgDocument.fillTree()
    self.dlgDocument.selectedGuid = None
    self.dlgDocument.exec_()
    if self.dlgDocument.selectedGuid is not None:
        listNames = ['id_kn', 'guid_document']
        listValues = [[self.id_kn, self.dlgDocument.guidDocument]]
        if insertFeatures('pb_kn_document', listNames, listValues):
            self.dlgFill()
        else:
            # Russian strings: "Error" / "An error occurred while adding the document"
            QMessageBox.warning(self.iface.mainWindow(), u'Ошибка',
                                u'Произошла ошибка добавления документа')