def _populate_database(self, Session, filepath):
    """Scan `filepath` for matching files and upsert each one into the documents table.

    For every file found, a fresh session is opened; the file is inserted if
    unknown, or its stored body/hash refreshed if the content hash changed.

    :param Session: session factory (e.g. SQLAlchemy sessionmaker); called once per file.
    :param filepath: root path scanned with the pattern from self._create_regex().
    """
    for fq_path, root, basename in find_files(filepath, self._create_regex()):
        dbsession = Session()
        try:
            # Renamed from `hash` to avoid shadowing the builtin hash().
            content_hash = calculate_hash(fq_path)
            contents = self.get_contents(fq_path)
            existing = dbsession.query(Document).filter_by(
                filepath=fq_path).first()
            if existing is None:
                # New file on disk: insert a fresh Document row.
                doc = Document(filename=basename, filepath=fq_path,
                               body=contents, hash=content_hash)
                dbsession.add(doc)
            elif existing.hash != content_hash:
                # File changed since last scan: refresh stored hash and body.
                existing.hash = content_hash
                existing.body = contents
            dbsession.commit()
        except Exception:
            # Best-effort per file: undo this file's work and continue with the next.
            # NOTE(review): the failure is swallowed silently; consider logging it.
            dbsession.rollback()
        finally:
            dbsession.close()
def is_already_sent(data, channel, priority=Priority.LOW):
    """Return True if `data` was recently sent to `channel` at this priority.

    Compares the hash of `data` against the hashes of messages already sent to
    the channel; if a match was sent recently enough (per `need_to_send`),
    reports it as a duplicate.

    Fix: the Python 2 `print` statement was a SyntaxError in Python 3; it is
    now the print() function.

    :param data: payload to check (hashed via calculate_hash).
    :param channel: channel object exposing `messages.all()`.
    :param priority: message priority controlling the resend window.
    :return: True if a recent duplicate exists, False otherwise.
    """
    data_hash = calculate_hash(data)
    for message in channel.messages.all():
        if data_hash == message.text_hash:
            time_difference = datetime.now() - message.last_sent_on
            if not need_to_send(time_difference, priority):
                print("Message was recently sent to the channel")
                return True
    return False
def query(self, cell, halfway=False):
    """Query the NAS benchmark for `cell` and log the observation.

    :param cell: dict of api.ModelSpec keyword arguments (e.g. matrix/ops).
    :param halfway: unused in this body; kept for interface compatibility —
        presumably selects half-budget metrics elsewhere, TODO confirm.
    :return: (metrics dict, adjacency matrix) where the metrics dict also
        carries the cell's hash under the 'hash' key.
    """
    spec = api.ModelSpec(**cell)
    cell_hash = calculate_hash(cell)
    # Query this model from dataset, returns a dictionary containing the
    # metrics associated with this model.
    metrics = self.nasbench.query(spec)
    adjacency = metrics.pop('module_adjacency')
    metrics.pop('module_operations')
    metrics['hash'] = cell_hash
    # Append this observation to the running log dataframe, keyed by hash.
    row = pd.DataFrame.from_records([metrics], index='hash')
    self.df = pd.concat([self.df, row])
    return metrics, adjacency
def __init__(self, seed):
    """Load the NAS-Bench-101 dataset and prepare an empty observation log.

    Performs one throwaway query so the pandas DataFrame gets the correct
    column structure, then empties it and resets the budget counters.

    :param seed: random seed, stored and also applied to numpy and the benchmark.
    """
    # Set random seed to check reproducibility of results
    self.seed = seed
    np.random.seed(seed)
    # Load the data from file (this will take some time)
    self.nasbench = api.NASBench('./models/nasbench_only108.tfrecord', seed=seed)
    # Lines below are just to construct proper pandas column structure
    cell = self.random_cell()
    model_spec = api.ModelSpec(cell['matrix'], cell['ops'])
    data = self.nasbench.query(model_spec)
    md5hash = calculate_hash(cell)
    data.pop('module_adjacency')
    data.pop('module_operations')
    data['hash'] = md5hash
    self.df = pd.DataFrame.from_records([data], index='hash')
    # Drop the bootstrap row so only the column structure remains.
    self.df.drop(self.df.index, inplace=True)
    self.reset_budget()  # Clear budgeting of this initial query as this was needed just to capture column names
def get_transaction_hash(self, tx, mode):
    """Extract hash-related fields from a raw hex transaction.

    :param tx: JSON-encoded transaction string.
    :param mode: ``method_query`` — hash the transaction's 'input' field;
        ``method_upload`` — return the upload metadata fields.
    :return: the hash of the input field (query mode), or a
        (name, model hash, url) tuple (upload mode).
    :raises ValueError: if `mode` is neither recognised value.
    """
    logger.debug(tx)
    logger.debug(mode)
    # Parse the JSON payload once instead of re-parsing it per field.
    payload = json.loads(tx)
    if mode == method_query:
        input_from_transaction_as_np = np.asarray(payload['input'])
        return calculate_hash(input_from_transaction_as_np)
    elif mode == method_upload:
        return payload['name'], payload['model'], payload['url']
    # Still a ValueError so existing callers' except clauses keep working,
    # but now with a diagnostic message.
    raise ValueError('unknown mode: {!r}'.format(mode))
def query_batch(self, size=1000):
    """Sample `size` distinct random architectures and query each of them.

    :param size: number of unique cells (by hash) to collect.
    :return: (copy of the benchmark's log dataframe,
              sampled cells keyed by hash,
              query results keyed by the same hashes)
    """
    progress = tqdm(total=size)
    unique_cells = {}
    print("I01DT: Collecting unique architectures")
    random.seed(self.seed)
    # Keep drawing random cells until `size` distinct hashes have been seen.
    while len(unique_cells) < size:
        candidate = self.nb.random_cell()
        key = str(calculate_hash(candidate))
        if key not in unique_cells:
            unique_cells[key] = candidate
            progress.update(1)
    progress.close()
    print("I02DT: Evaluating unique architectures")
    results = {}
    for key in tqdm(unique_cells):
        results[key] = self.nb.query(unique_cells[key])
    print("I03DT: Saving log df of observations")
    self.nb.save_df()
    print("I04DT: Plotting observed data")
    return self.nb.df.copy(), unique_cells, results
def analyse_article_from_xml_dump(self, page):
    """
    Analyse page from XML Dump Iterator.

    Walks the revision history: skips hidden/missing texts, flags vandalism
    (known spam hashes and large sudden deletions), then runs authorship
    attribution on every surviving revision.

    :param page: Page meta data and a Revision iterator. Each revision contains metadata and text.
    """
    # Iterate over revisions of the article.
    for revision in page:
        text = revision.text or ''
        if not text and (revision.deleted.text or revision.deleted.restricted):
            # equivalent of "'texthidden' in revision or 'textmissing' in revision" in analyse_article
            continue
        vandalism = False
        # Update the information about the previous revision.
        self.revision_prev = self.revision_curr
        rev_id = revision.id
        # Prefer the dump-provided sha1; fall back to hashing the text ourselves.
        rev_hash = revision.sha1 or calculate_hash(text)
        if rev_hash in self.spam_hashes:
            # This exact content was already flagged as spam earlier.
            vandalism = True
        # TODO: spam detection: DELETION
        text_len = len(text)
        if not vandalism and not(revision.comment and revision.minor):
            # if content is not moved (flag) to different article in good faith, check for vandalism
            # if revisions have reached a certain size
            if self.revision_prev.length > PREVIOUS_LENGTH and \
                    text_len < CURR_LENGTH and \
                    ((text_len-self.revision_prev.length) / self.revision_prev.length) <= CHANGE_PERCENTAGE:
                # VANDALISM: CHANGE PERCENTAGE - DELETION
                vandalism = True
        if vandalism:
            # print("---------------------------- FLAG 1")
            # Revert to the previous revision and remember the spam signature.
            self.revision_curr = self.revision_prev
            self.spam_ids.append(rev_id)
            self.spam_hashes.append(rev_hash)
        else:
            # Information about the current revision.
            self.revision_curr = Revision()
            self.revision_curr.id = rev_id
            self.revision_curr.length = text_len
            self.revision_curr.timestamp = revision.timestamp.long_format()
            # Get editor information
            if revision.user:
                user_text = revision.user.text
                contributor_name = '' if not user_text or user_text == 'None' else user_text
                if revision.user.id is None and contributor_name or revision.user.id == 0:
                    # Anonymous editor (no id but a name) or explicit id 0.
                    contributor_id = 0
                else:
                    contributor_id = revision.user.id or ''
            else:
                # Some revisions don't have contributor.
                contributor_name = ''
                contributor_id = ''
            # Editors are keyed by id; anonymous editors are encoded as '0|<name>'.
            editor = contributor_id
            editor = str(editor) if editor != 0 else '0|{}'.format(contributor_name)
            self.revision_curr.editor = editor
            # Content within the revision.
            self.text_curr = text.lower()
            # Perform comparison.
            vandalism = self.determine_authorship()
            if vandalism:
                # print("---------------------------- FLAG 2")
                self.revision_curr = self.revision_prev  # skip revision with vandalism in history
                self.spam_ids.append(rev_id)
                self.spam_hashes.append(rev_hash)
            else:
                # Add the current revision with all the information.
                self.revisions.update({self.revision_curr.id: self.revision_curr})
                self.ordered_revisions.append(self.revision_curr.id)
        # Reset the duplicate-occurrence bookkeeping before the next revision.
        self.temp = []
def analyse_sentences_in_paragraphs(self, unmatched_paragraphs_curr, unmatched_paragraphs_prev):
    """
    Match the sentences of still-unmatched current paragraphs against the
    previous revision's unmatched paragraphs, then against the global
    sentence hash table from older revisions.

    :param unmatched_paragraphs_curr: current-revision paragraphs with no paragraph-level match.
    :param unmatched_paragraphs_prev: previous-revision paragraphs with no paragraph-level match.
    :return: (unmatched_sentences_curr, unmatched_sentences_prev,
              matched_sentences_prev, total_sentences)
    """
    # Containers for unmatched and matched sentences.
    unmatched_sentences_curr = []
    unmatched_sentences_prev = []
    matched_sentences_prev = []
    total_sentences = 0
    # Iterate over the unmatched paragraphs of the current revision.
    for paragraph_curr in unmatched_paragraphs_curr:
        # Split the current paragraph into sentences.
        sentences = split_into_sentences(paragraph_curr.value)
        # Iterate over the sentences of the current paragraph
        for sentence in sentences:
            # Create the Sentence structure.
            sentence = sentence.strip()
            if not sentence:
                # dont track empty lines
                continue
            sentence = ' '.join(split_into_tokens(sentence))  # here whitespaces in the sentence are cleaned
            hash_curr = calculate_hash(sentence)  # then hash values is calculated
            matched_curr = False
            total_sentences += 1
            # Iterate over the unmatched paragraphs from the previous revision.
            for paragraph_prev in unmatched_paragraphs_prev:
                for sentence_prev in paragraph_prev.sentences.get(hash_curr, []):
                    if not sentence_prev.matched:
                        # Scan word-level match flags of the candidate sentence.
                        matched_one = False
                        matched_all = True
                        for word_prev in sentence_prev.words:
                            if word_prev.matched:
                                matched_one = True
                            else:
                                matched_all = False
                        if not matched_one:
                            # if there is not any already matched prev word, so set them all as matched
                            sentence_prev.matched = True
                            matched_curr = True
                            matched_sentences_prev.append(sentence_prev)
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True
                            # Add the sentence information to the paragraph.
                            if hash_curr in paragraph_curr.sentences:
                                paragraph_curr.sentences[hash_curr].append(sentence_prev)
                            else:
                                paragraph_curr.sentences.update({sentence_prev.hash_value: [sentence_prev]})
                            paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                            break
                        elif matched_all:
                            # if all prev words in this sentence are already matched
                            sentence_prev.matched = True
                            matched_sentences_prev.append(sentence_prev)
                if matched_curr:
                    break
            # Iterate over the hash table of sentences from old revisions.
            if not matched_curr:
                for sentence_prev in self.sentences_ht.get(hash_curr, []):
                    if not sentence_prev.matched:
                        matched_one = False
                        matched_all = True
                        for word_prev in sentence_prev.words:
                            if word_prev.matched:
                                matched_one = True
                            else:
                                matched_all = False
                        if not matched_one:
                            # if there is not any already matched prev word, so set them all as matched
                            sentence_prev.matched = True
                            matched_curr = True
                            matched_sentences_prev.append(sentence_prev)
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True
                            # Add the sentence information to the paragraph.
                            if hash_curr in paragraph_curr.sentences:
                                paragraph_curr.sentences[hash_curr].append(sentence_prev)
                            else:
                                paragraph_curr.sentences.update({sentence_prev.hash_value: [sentence_prev]})
                            paragraph_curr.ordered_sentences.append(sentence_prev.hash_value)
                            break
                        elif matched_all:
                            # if all prev words in this sentence are already matched
                            sentence_prev.matched = True
                            matched_sentences_prev.append(sentence_prev)
            # If the sentence did not match,
            # then include in the container of unmatched sentences for further analysis.
            if not matched_curr:
                sentence_curr = Sentence()
                sentence_curr.value = sentence
                sentence_curr.hash_value = hash_curr
                if hash_curr in paragraph_curr.sentences:
                    paragraph_curr.sentences[hash_curr].append(sentence_curr)
                else:
                    paragraph_curr.sentences.update({sentence_curr.hash_value: [sentence_curr]})
                paragraph_curr.ordered_sentences.append(sentence_curr.hash_value)
                unmatched_sentences_curr.append(sentence_curr)
    # Identify the unmatched sentences in the previous paragraph revision.
    for paragraph_prev in unmatched_paragraphs_prev:
        for sentence_prev_hash in paragraph_prev.ordered_sentences:
            if len(paragraph_prev.sentences[sentence_prev_hash]) > 1:
                # Duplicate hash: self.temp counts occurrences so each ordered
                # position resolves to its own sentence object.
                s = 's-{}-{}'.format(paragraph_prev, sentence_prev_hash)
                self.temp.append(s)
                count = self.temp.count(s)
                sentence_prev = paragraph_prev.sentences[sentence_prev_hash][count - 1]
            else:
                sentence_prev = paragraph_prev.sentences[sentence_prev_hash][0]
            if not sentence_prev.matched:
                unmatched_sentences_prev.append(sentence_prev)
                # to reset 'matched words in analyse_words_in_sentences' of unmatched paragraphs and sentences
                sentence_prev.matched = True
                matched_sentences_prev.append(sentence_prev)
    return unmatched_sentences_curr, unmatched_sentences_prev, matched_sentences_prev, total_sentences
def analyse_paragraphs_in_revision(self):
    """
    Match the paragraphs of the current revision against the previous
    revision's paragraphs, then against the global paragraph hash table
    from older revisions.

    :return: (unmatched_paragraphs_curr, unmatched_paragraphs_prev, matched_paragraphs_prev)
    """
    # Containers for unmatched and matched paragraphs.
    unmatched_paragraphs_curr = []
    unmatched_paragraphs_prev = []
    matched_paragraphs_prev = []
    # Split the text of the current into paragraphs.
    paragraphs = split_into_paragraphs(self.text_curr)
    # Iterate over the paragraphs of the current version.
    for paragraph in paragraphs:
        # Build Paragraph structure and calculate hash value.
        paragraph = paragraph.strip()
        if not paragraph:
            # dont track empty lines
            continue
        # TODO should we clean whitespaces in paragraph level?
        # paragraph = ' '.join(split_into_tokens(paragraph))
        hash_curr = calculate_hash(paragraph)
        matched_curr = False
        # If the paragraph is in the previous revision,
        # update the authorship information and mark both paragraphs as matched (also in HT).
        for paragraph_prev in self.revision_prev.paragraphs.get(hash_curr, []):
            if not paragraph_prev.matched:
                # Scan every word of every sentence in the candidate paragraph.
                matched_one = False
                matched_all = True
                for h in paragraph_prev.sentences:
                    for s_prev in paragraph_prev.sentences[h]:
                        for w_prev in s_prev.words:
                            if w_prev.matched:
                                matched_one = True
                            else:
                                matched_all = False
                if not matched_one:
                    # if there is not any already matched prev word, so set them all as matched
                    matched_curr = True
                    paragraph_prev.matched = True
                    matched_paragraphs_prev.append(paragraph_prev)
                    # Set all sentences and words of this paragraph as matched
                    for hash_sentence_prev in paragraph_prev.sentences:
                        for sentence_prev in paragraph_prev.sentences[hash_sentence_prev]:
                            sentence_prev.matched = True
                            for word_prev in sentence_prev.words:
                                word_prev.matched = True
                    # Add paragraph to current revision.
                    if hash_curr in self.revision_curr.paragraphs:
                        self.revision_curr.paragraphs[hash_curr].append(paragraph_prev)
                    else:
                        self.revision_curr.paragraphs.update({paragraph_prev.hash_value: [paragraph_prev]})
                    self.revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                    break
                elif matched_all:
                    # if all prev words in this paragraph are already matched
                    paragraph_prev.matched = True
                    # for hash_sentence_prev in paragraph_prev.sentences:
                    #     for sentence_prev in paragraph_prev.sentences[hash_sentence_prev]:
                    #         sentence_prev.matched = True
                    matched_paragraphs_prev.append(paragraph_prev)
        # If the paragraph is not in the previous revision, but it is in an older revision
        # update the authorship information and mark both paragraphs as matched.
        if not matched_curr:
            for paragraph_prev in self.paragraphs_ht.get(hash_curr, []):
                if not paragraph_prev.matched:
                    matched_one = False
                    matched_all = True
                    for h in paragraph_prev.sentences:
                        for s_prev in paragraph_prev.sentences[h]:
                            for w_prev in s_prev.words:
                                if w_prev.matched:
                                    matched_one = True
                                else:
                                    matched_all = False
                    if not matched_one:
                        # if there is not any already matched prev word, so set them all as matched
                        matched_curr = True
                        paragraph_prev.matched = True
                        matched_paragraphs_prev.append(paragraph_prev)
                        # Set all sentences and words of this paragraph as matched
                        for hash_sentence_prev in paragraph_prev.sentences:
                            for sentence_prev in paragraph_prev.sentences[hash_sentence_prev]:
                                sentence_prev.matched = True
                                for word_prev in sentence_prev.words:
                                    word_prev.matched = True
                        # Add paragraph to current revision.
                        if hash_curr in self.revision_curr.paragraphs:
                            self.revision_curr.paragraphs[hash_curr].append(paragraph_prev)
                        else:
                            self.revision_curr.paragraphs.update({paragraph_prev.hash_value: [paragraph_prev]})
                        self.revision_curr.ordered_paragraphs.append(paragraph_prev.hash_value)
                        break
                    elif matched_all:
                        # if all prev words in this paragraph are already matched
                        paragraph_prev.matched = True
                        # for hash_sentence_prev in paragraph_prev.sentences:
                        #     for sentence_prev in paragraph_prev.sentences[hash_sentence_prev]:
                        #         sentence_prev.matched = True
                        matched_paragraphs_prev.append(paragraph_prev)
        # If the paragraph did not match with previous revisions,
        # add to container of unmatched paragraphs for further analysis.
        if not matched_curr:
            paragraph_curr = Paragraph()
            paragraph_curr.hash_value = hash_curr
            paragraph_curr.value = paragraph
            if hash_curr in self.revision_curr.paragraphs:
                self.revision_curr.paragraphs[hash_curr].append(paragraph_curr)
            else:
                self.revision_curr.paragraphs.update({paragraph_curr.hash_value: [paragraph_curr]})
            self.revision_curr.ordered_paragraphs.append(paragraph_curr.hash_value)
            unmatched_paragraphs_curr.append(paragraph_curr)
    # Identify unmatched paragraphs in previous revision for further analysis.
    for paragraph_prev_hash in self.revision_prev.ordered_paragraphs:
        if len(self.revision_prev.paragraphs[paragraph_prev_hash]) > 1:
            # Duplicate hash: self.temp counts occurrences so each ordered
            # position resolves to its own paragraph object.
            s = 'p-{}-{}'.format(self.revision_prev, paragraph_prev_hash)
            self.temp.append(s)
            count = self.temp.count(s)
            paragraph_prev = self.revision_prev.paragraphs[paragraph_prev_hash][count - 1]
        else:
            paragraph_prev = self.revision_prev.paragraphs[paragraph_prev_hash][0]
        if not paragraph_prev.matched:
            unmatched_paragraphs_prev.append(paragraph_prev)
    return unmatched_paragraphs_curr, unmatched_paragraphs_prev, matched_paragraphs_prev
def analyse_article(self, page):
    """
    Analyse page in json form.

    Walks every revision of the article: skips hidden/missing texts, flags
    vandalism (known spam hashes and large sudden deletions), then runs
    authorship attribution on every surviving revision.

    BUGFIX: a trailing `break` at the end of the loop body made the loop stop
    after the first processed revision, contradicting this docstring and the
    break-free sibling analyse_article_from_xml_dump(); it has been removed.

    :param page: List of revisions. Each revision is a dict and contains metadata and text.
    """
    # Iterate over revisions of the article.
    for revision in page:
        if 'texthidden' in revision or 'textmissing' in revision:
            continue
        vandalism = False
        # Update the information about the previous revision.
        self.revision_prev = self.revision_curr
        text = revision.get('*', '')
        rev_id = int(revision['revid'])
        # Prefer the API-provided sha1; fall back to hashing the text ourselves.
        rev_hash = revision.get('sha1', calculate_hash(text))
        if rev_hash in self.spam_hashes:
            # This exact content was already flagged as spam earlier.
            vandalism = True
        # TODO: spam detection: DELETION
        text_len = len(text)
        if not vandalism and not(revision.get('comment') and 'minor' in revision):
            # if content is not moved (flag) to different article in good faith, check for vandalism
            # if revisions have reached a certain size
            if self.revision_prev.length > PREVIOUS_LENGTH and \
                    text_len < CURR_LENGTH and \
                    ((text_len-self.revision_prev.length) / self.revision_prev.length) <= CHANGE_PERCENTAGE:
                # VANDALISM: CHANGE PERCENTAGE - DELETION
                vandalism = True
        if vandalism:
            # print("---------------------------- FLAG 1")
            # Revert to the previous revision and remember the spam signature.
            self.revision_curr = self.revision_prev
            self.spam_ids.append(rev_id)
            self.spam_hashes.append(rev_hash)
        else:
            # Information about the current revision.
            self.revision_curr = Revision()
            self.revision_curr.id = rev_id
            self.revision_curr.length = text_len
            self.revision_curr.timestamp = revision['timestamp']
            # Get editor information.
            # Some revisions don't have editor.
            contributor_id = revision.get('userid', '')
            contributor_name = revision.get('user', '')
            # Editors are keyed by id; anonymous editors are encoded as '0|<name>'.
            editor = contributor_id
            editor = str(editor) if editor != 0 else '0|{}'.format(contributor_name)
            self.revision_curr.editor = editor
            # Content within the revision.
            self.text_curr = text.lower()
            # Perform comparison.
            vandalism = self.determine_authorship()
            if vandalism:
                # print("---------------------------- FLAG 2")
                self.revision_curr = self.revision_prev  # skip revision with vandalism in history
                self.spam_ids.append(rev_id)
                self.spam_hashes.append(rev_hash)
            else:
                # Add the current revision with all the information.
                self.revisions.update({self.revision_curr.id: self.revision_curr})
                self.ordered_revisions.append(self.revision_curr.id)
        # Reset the duplicate-occurrence bookkeeping before the next revision.
        self.temp = []
def generate_metadata_file(data_file, meta_file):
    """Write a JSON metadata file recording the data file's hash and the current time.

    :param data_file: file whose hash is recorded (hashed via calculate_hash).
    :param meta_file: destination path for the JSON metadata.
    """
    metadata = {
        'hash': calculate_hash(data_file),
        'time': str(time.time()),
    }
    with open(meta_file, 'w') as handle:
        json.dump(metadata, handle)
    log(f"Metadata file {meta_file} generated")