def main():
    tag = Tag.nodes.get(name=Tag.FILTER_1)
    total_diffusion_instances = 0
    for node in tag.users:
        print('processing {}'.format(node.screen_name))
        diffusion_instances = 0
        # get tweets of transnational user
        query = (
            ' MATCH (user:Node {{screen_name:"{}"}})'.format(node.screen_name) +
            ' MATCH (user)-[:STATUS]->(statuses)'
            ' RETURN statuses.text, statuses.date, user.screen_name')
        statuses, meta = db.cypher_query(query)
        for index, status in enumerate(statuses):
            print('{}, {}/{}'.format(node.screen_name, index, len(statuses)),
                  end='\r')
            # five days in unix time
            time_delta = 60 * 60 * 24 * 5
            # collect friend statuses posted before status
            query = (
                ' MATCH (user:Node {{screen_name:"{name}"}})'
                ' MATCH (user)-[:FRIEND]->(connections)'
                ' MATCH (connections)-[:STATUS]->(statuses)'
                ' MATCH (statuses {{lang:"en"}})'
                ' MATCH (statuses)<-[:STATUS]-(nodes)'
                ' WHERE statuses.date > {min_date} and statuses.date < {max_date}'
                ' RETURN statuses.text, statuses.date, nodes.screen_name'
                ' ORDER BY statuses.date DESC').format(
                    name=node.screen_name,
                    min_date=status[1] - time_delta,
                    max_date=status[1])
            friend_statuses, meta = db.cypher_query(query)
            # collect follower statuses posted after status
            query = (
                ' MATCH (user:Node {{screen_name:"{name}"}})'
                ' MATCH (user)-[:FOLLOWER]->(connections)'
                ' MATCH (connections)-[:STATUS]->(statuses)'
                ' MATCH (statuses {{lang:"en"}})'
                ' MATCH (statuses)<-[:STATUS]-(nodes)'
                ' WHERE statuses.date > {min_date} and statuses.date < {max_date}'
                ' RETURN statuses.text, statuses.date, nodes.screen_name'
                ' ORDER BY statuses.date DESC').format(
                    name=node.screen_name,
                    min_date=status[1],
                    max_date=status[1] + time_delta)
            follower_statuses, meta = db.cypher_query(query)
            if friend_statuses and follower_statuses:
                # cluster and identify diffusion
                if identify_transnational_diffusion(
                        node, len(friend_statuses),
                        friend_statuses + [status] + follower_statuses,
                        output=False):
                    diffusion_instances += 1
        total_diffusion_instances += diffusion_instances
        print('{} diffusion instances: {}'.format(node.screen_name,
                                                  diffusion_instances))
    print('Total Diffusion Instances: {}'.format(total_diffusion_instances))
def delete_data():
    """ Delete existing data """
    print('Delete all nodes and relationships...')
    query = 'MATCH (n) DETACH DELETE n'
    db.cypher_query(query)
def changeDp(user, photo):
    usernode = User.nodes.get(name=user)
    photonode = Photos.nodes.get(name=photo)
    # remove the existing Dp relationship before connecting the new photo
    query = "MATCH (a:User {name:{no_node}})-[b:Dp]->(c) DELETE b"
    db.cypher_query(query, {"no_node": user})
    usernode.currentdp.connect(photonode)
def handle_noargs(self, **options):
    db.cypher_query(
        '''
        MATCH (n)
        OPTIONAL MATCH (n)-[r]-()
        WITH n, r LIMIT 100000
        DELETE n, r;
        '''
    )
    subprocess.call(["python", "manage.py", "setup_loc_environ"])
    subprocess.call(["python", "manage.py", "setup_date_environ"])
def clean_up():
    db.cypher_query(
        '''
        MATCH (n)
        OPTIONAL MATCH (n)-[r]-()
        WITH n, r LIMIT 100000
        DELETE n, r;
        '''
    )
    exs = list(RootLocation.nodes.all())
    if not exs:
        RootLocation().save()
    exs = list(RootDate.nodes.all())
    if not exs:
        RootDate().save()
def isRequest(current_user, request_user):
    query = ("MATCH (a:User {name:{name1}})-[:Request]->(b:User {name:{name2}}) "
             "RETURN a, b")
    result, columns = db.cypher_query(
        query, {"name1": current_user, "name2": request_user})
    return bool(result)
def get_top_k_cited_papers(request, name, year, k):
    query = "MATCH (n1)-[:CITED]->(n2) " \
            "WHERE n2.journal='%s' AND n2.year=%s " \
            "RETURN n2, count(n2) AS count " \
            "ORDER BY count DESC " \
            "LIMIT %s" % (name, year, k)
    results, meta = db.cypher_query(query)
    return JsonResponse({
        "papers": [Article.inflate(row[0]).toDict() for row in results]
    })
def getDp(user):
    query = "MATCH (a:User {name:{no_name}})-[:Dp]->(c) RETURN c"
    value = ""
    result, columns = db.cypher_query(query, {"no_name": user})
    for row in result:
        value = Photos.inflate(row[0])
    return value
def remove_weak_pictures(self):
    """Removes all pictures that are not RELATED_TO any interest."""
    self.stdout.write("Removing all pictures that are not RELATED_TO any interest")
    query = ("MATCH (p:Picture) OPTIONAL MATCH (p)--(i:Interest) "
             "WITH p, i WHERE i IS NULL DETACH DELETE p")
    results, meta = db.cypher_query(query)
def cipher_query(self, query):
    """ Execute normal neo4j queries """
    from neomodel import db
    try:
        results, meta = db.cypher_query(query)
    except Exception as e:
        # raising a bare string is a TypeError in Python 3; wrap in Exception
        raise Exception(
            "Failed to execute Cypher Query: " + query + "\n" + str(e))
    return results
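# A hedged companion sketch (not from the original source): db.cypher_query
# also accepts a params dict, so values can be passed as Cypher parameters
# instead of being interpolated into the query string, avoiding quoting bugs
# and Cypher injection. The function name below is an illustrative assumption;
# $name is the modern parameter syntax (older snippets in this file use {name}).
def cipher_query_with_params(query, params=None):
    from neomodel import db
    try:
        results, meta = db.cypher_query(query, params or {})
    except Exception as e:
        raise Exception(
            "Failed to execute Cypher Query: " + query + "\n" + str(e))
    return results

# Example usage:
# cipher_query_with_params("MATCH (u:User {name: $name}) RETURN u",
#                          {"name": "jim"})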
def remove_weak_interests(self):
    """Removes all interests that have less than 2 pictures RELATED_TO them."""
    self.stdout.write("Removing all interests that have less than 2 pictures RELATED_TO them")
    query = ("MATCH (i:Interest) OPTIONAL MATCH (i)<-[r:RELATED_TO]-(p:Picture) "
             "WITH i, COUNT(p) AS rel WHERE rel < 2 DETACH DELETE i")
    results, meta = db.cypher_query(query)
def remove_duplicate_quests(self):
    skip = 0
    while True:
        query = 'MATCH (q:Quest) ' \
                'RETURN DISTINCT q ' \
                'SKIP %s LIMIT 25' % skip
        skip += 24
        res, _ = db.cypher_query(query)
        if not res.one:
            break
        for quest in [Quest.inflate(row[0]) for row in res]:
            query = 'MATCH (q:Quest {object_uuid:"%s"}) WHERE NOT ' \
                    '(q)-[]-(:Pleb) AND NOT (q)-[]-(:PublicOfficial) ' \
                    'AND NOT (q)-[]-(:Mission) WITH q ' \
                    'OPTIONAL MATCH (q)-[r]-() ' \
                    'DELETE q, r' % quest.object_uuid
            res, _ = db.cypher_query(query)
    cache.set(self.cache_key, True)
def test_set_quest_about(self):
    self.quest.about = "some short summary"
    self.quest.save()
    setup_onboarding(self.quest, self.mission)
    query = 'MATCH (a:Mission {object_uuid: "%s"})-[:MUST_COMPLETE]->' \
            '(task:OnboardingTask {title: "%s"}) RETURN task' % (
                self.mission.object_uuid, settings.QUEST_ABOUT_TITLE)
    res, _ = db.cypher_query(query)
    self.assertTrue(res.one['completed'])
def test_set_bank_setup(self):
    self.quest.account_verified = "verified"
    self.quest.save()
    setup_onboarding(self.quest, self.mission)
    query = 'MATCH (a:Mission {object_uuid: "%s"})-[:MUST_COMPLETE]->' \
            '(task:OnboardingTask {title: "%s"}) RETURN task' % (
                self.mission.object_uuid, settings.BANK_SETUP_TITLE)
    res, _ = db.cypher_query(query)
    self.assertTrue(res.one['completed'])
def test_set_wallpaper(self):
    self.quest.wallpaper_pic = "something.png"
    self.quest.save()
    setup_onboarding(self.quest, self.mission)
    query = 'MATCH (a:Mission {object_uuid: "%s"})-[:MUST_COMPLETE]->' \
            '(task:OnboardingTask {title: "%s"}) RETURN task' % (
                self.mission.object_uuid, settings.QUEST_WALLPAPER_TITLE)
    res, _ = db.cypher_query(query)
    self.assertTrue(res.one['completed'])
def endorsed(self, request, owner_username):
    query = 'MATCH (q:Quest {owner_username:"%s"})-' \
            '[:ENDORSES]->(m:Mission) RETURN m' % owner_username
    res, _ = db.cypher_query(query)
    page = self.paginate_queryset(
        [Mission.inflate(mission[0]) for mission in res])
    serializer = self.serializer_class(page, many=True,
                                       context={'request': request})
    return self.get_paginated_response(serializer.data)
def delete_by_id(pk):
    query = '''
        MATCH (n:Cipher)
        WHERE ID(n) = {id}
        DETACH DELETE n
    '''
    results, meta = db.cypher_query(query, dict(id=pk))
    print(results)
    print(meta)
def get_top_restaurant(uname):
    params = {
        'uname': uname,
    }
    query = ("MATCH (U:User {name:$uname})-[r:RATED]->(res:Restaurant) "
             "WITH max(r.rating) AS MAX_RATING "
             "MATCH (U:User {name:$uname})-[r:RATED]->(res:Restaurant) "
             "WHERE r.rating = MAX_RATING "
             "RETURN res.rid ORDER BY rand() LIMIT 2")
    result, meta = db.cypher_query(query, params)
    return result
def getUser() -> User:
    params = {'authUser': getAuth().email}
    results, columns = db.cypher_query(
        "MATCH (u:User) WHERE u.email <> {authUser} RETURN u", params=params)
    users = [User.inflate(row[0]) for row in results]
    userNames = [user.name for user in users]
    userChoosen = selectOptionInList(
        "Choose the number of the user you want to view", userNames)
    return users[userChoosen]
def table_view(query):
    import pandas as pd
    results, columns = db.cypher_query(query)
    for line in results:
        for index, item in enumerate(line):
            if isinstance(item, Node):
                # show a readable repr instead of the raw Node object
                line[index] = repr(item)
    print(pd.DataFrame(results, columns=columns))
    return results, columns
def clear_unseen(cls, username):
    """
    Sets all the notifications for the given user to True so that there
    are no more unread notifications.

    Doesn't return anything because if the query fails a Cypher Exception
    is thrown and a 500 error will propagate out.

    :param username:
    :return:
    """
    value = get_current_time().astimezone(pytz.utc)
    epoch_date = datetime(1970, 1, 1, tzinfo=pytz.utc)
    time_seen = float((value - epoch_date).total_seconds())
    query = 'MATCH (a:Pleb {username: "%s"})<-[:NOTIFICATION_TO]-' \
            '(n:Notification) WHERE n.seen=False' \
            ' SET n.seen = True, ' \
            'n.time_seen = %s' % (username, time_seen)
    db.cypher_query(query)
def coupon(self):
    if self.coupon_id:
        try:
            query = f"MATCH (a) WHERE ID(a) = {self.coupon_id} RETURN a"
            result, meta = db.cypher_query(query)
            return result
        except Coupon.DoesNotExist:
            pass
    return None
def create_vote_relationship(content_id, voter_username, vote_active,
                             vote_type):
    try:
        query = 'MATCH (v:VotableContent {object_uuid:"%s"}), ' \
                '(p:Pleb {username:"%s"}) ' \
                'CREATE UNIQUE (v)<-[vote:PLEB_VOTES]-(p) ' \
                'WITH v, vote, p SET vote.active=%s, ' \
                'vote.vote_type=%s RETURN v' % (
                    content_id, voter_username, vote_active, vote_type)
        res, _ = db.cypher_query(query)
    except (ConstraintViolation, Exception):
        query = 'MATCH (v:VotableContent {object_uuid:"%s"})' \
                '<-[vote:PLEB_VOTES]-(p:Pleb {username:"%s"}) ' \
                'SET vote.active=%s, vote.vote_type=%s RETURN v' % (
                    content_id, voter_username, vote_active, vote_type)
        res, _ = db.cypher_query(query)
    return res
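# Note: CREATE UNIQUE (used above) was removed in Neo4j 4.x. A hedged sketch
# of the same upsert using MERGE, which creates the PLEB_VOTES relationship
# only if it is missing and then updates its properties, so the fallback
# branch above becomes unnecessary. Passing the values as parameters is an
# assumption; the original interpolates them into the query string.
def create_vote_relationship_merge(content_id, voter_username, vote_active,
                                   vote_type):
    query = ('MATCH (v:VotableContent {object_uuid: $content_id}), '
             '(p:Pleb {username: $username}) '
             'MERGE (v)<-[vote:PLEB_VOTES]-(p) '
             'SET vote.active = $active, vote.vote_type = $vote_type '
             'RETURN v')
    res, _ = db.cypher_query(query, {'content_id': content_id,
                                     'username': voter_username,
                                     'active': vote_active,
                                     'vote_type': vote_type})
    return res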
def get_story_nodes(self, item, reverse=False, limit=False):
    """
    Create a raw cypher query for story of an artifact and query neo4j with it.

    :param node item: a Neo4j node whose story is requested by the user
    :kwarg bool reverse: specifies the direction to proceed from current node
        corresponding to the story_flow
    :kwarg bool limit: specifies if LIMIT keyword should be added to the
        created cypher query
    :return: story paths for a particular artifact
    :rtype: list
    """
    query = ''
    if reverse is True:
        rel_label = 'backward_relationship'
        node_label = 'backward_label'
    else:
        rel_label = 'forward_relationship'
        node_label = 'forward_label'

    curr_node_label = item.__label__
    if curr_node_label not in self.story_flow_list:
        raise ValidationError(
            'The story is not available for this kind of resource')

    while True:
        curr_node_info = self.story_flow(curr_node_label)
        if not curr_node_info:
            break

        if curr_node_label == item.__label__:
            query = """\
MATCH ({var}:{label}) WHERE id({var})= {node_id}
CALL apoc.path.expandConfig({var}, {{sequence:\'{label}
""".format(var=curr_node_label.lower(), label=curr_node_label,
           node_id=item.id)

        query += ', {0}, {1}'.format(curr_node_info[rel_label],
                                     curr_node_info[node_label])
        curr_node_label = curr_node_info[node_label]

    if query:
        query += """\
\', minLevel:1}) YIELD path
RETURN path
ORDER BY length(path) DESC
"""

    if query and limit:
        query += ' LIMIT 1'

    results = []
    if query:
        results, _ = db.cypher_query(query)

    return results
def get_student_on_semester_for_fieldofstudy(self, student, fieldofstudy):
    results, meta = db.cypher_query(
        'MATCH (s:Student)-[r:STUDIES]->(f:FieldOfStudy) '
        'WHERE (s.index_number=' + str(student.index_number) +
        ' AND f.name="' + fieldofstudy.name +
        '" AND f.faculty="' + fieldofstudy.faculty +
        '" AND f.start_years="' + fieldofstudy.start_years +
        '") RETURN r.on_semester')
    semester = [row[0] for row in results]
    return (semester or [None])[0]
def get_node_by_id(cls, id):
    labels = ''.join(label_string(cls.inherited_labels()))
    results = db.cypher_query(
        'MATCH (n{}) WHERE ID(n)={{id}} RETURN n'.format(labels),
        dict(id=id)
    )
    if len(results[0]) == 0:
        raise cls.DoesNotExist('No node found with given ID')
    return cls.inflate(results[0][0]['n'])
def get_rating_keyword_describes_course(self, keyword_course, course):
    results, meta = db.cypher_query(
        'MATCH (k:Keyword)-[r:DESCRIBES]->(c:Course) '
        'WHERE (k.word="' + keyword_course.word +
        '" AND c.name="' + course.name +
        '") RETURN DISTINCT r.rating')
    value = [row[0] for row in results]
    if len(value) == 0:
        return False
    return value[0]
def get_all_fields_of_study_for_student(self, student):
    results, meta = db.cypher_query(
        'MATCH (s:Student)-[r:STUDIES]->(f:FieldOfStudy) '
        'WHERE s.index_number=' + str(student.index_number) + ' RETURN f')
    field_of_study = [FieldOfStudy.inflate(row[0]) for row in results]
    if len(field_of_study) == 0:
        return None
    return field_of_study
def get_past_professors(self, student):
    field_of_study = self.get_field_of_study_for_student(student)
    results, meta = db.cypher_query(
        'MATCH (p:Professor)-[r:TEACHES]->(c:Course)-[r2:HAS]->(f:FieldOfStudy) '
        'WHERE (f.name="' + field_of_study.name +
        '" AND f.start_years="' + field_of_study.start_years +
        '" AND f.faculty="' + field_of_study.faculty +
        '") RETURN DISTINCT p')
    professors = [Professor.inflate(row[0]) for row in results]
    return professors
def get_course_by_field_of_study(self, name, field_of_study):
    results, meta = db.cypher_query(
        'MATCH (c:Course)-[r:HAS]->(f:FieldOfStudy) '
        'WHERE (c.name="' + name +
        '" AND f.name="' + field_of_study.name +
        '" AND f.start_years="' + field_of_study.start_years +
        '" AND f.faculty="' + field_of_study.faculty +
        '") RETURN c')
    course = [Course.inflate(row[0]) for row in results]
    return (course or [None])[0]
def existPhoto(photo, user):
    url = getUrl(user) + photo
    print(url)
    query = "MATCH (a:Photos {name: {no_name}}) RETURN a"
    results, columns = db.cypher_query(query, {"no_name": url})
    return bool(results)
def existUser(user):
    query = "MATCH (a:User {name: {no_name}}) RETURN a"
    results, columns = db.cypher_query(query, {"no_name": user})
    return bool(results)
def generic_update_rel(rel_class, request, labels, params, node_id):
    labels = labels.replace(')-[', ')-[r')
    query = 'MATCH ({}) WHERE ID(r)={{this}} RETURN r'.format(labels)
    query_params = dict(this=node_id)
    results, meta = db.cypher_query(query, query_params)
    rel = rel_class.inflate(results[0][0])
    form = form_for_node_properties(rel, params.keys(), params)
    if form.is_valid():
        set_node_properties_from_params(rel, form.cleaned_data)
        rel.save()
    return rel
def cypher(self, query):
    """ Execute normal neo4j queries """
    from neomodel import db
    try:
        results, meta = db.cypher_query(query)
    except Exception as e:
        raise Exception(
            "Failed to execute Cypher Query: %s\n%s" % (query, str(e)))
    # logger.debug("Graph query.\nResults: %s\nMeta: %s" % (results, meta))
    return results
def published_links(self):
    results = db.cypher_query(
        """
        MATCH (n:Link)-[:ABOUT]->()<-[*]-(p:Person)
        WHERE ID(p)={id} AND n.publish_date IS NOT NULL
        RETURN DISTINCT n
        """,
        dict(id=self._id)
    )
    return [
        Link.inflate(result['n'])
        for result in results[0]
    ]
def published_experiences(self):
    results = db.cypher_query(
        """
        MATCH (n:Experience)-[:WITH]->()<-[*]-(p:Person)
        WHERE ID(p)={id} AND n.publish_date IS NOT NULL
        RETURN DISTINCT n
        """,
        dict(id=self._id)
    )
    return [
        Experience.inflate(result['n'])
        for result in results[0]
    ]
def get_or_create(cls, to_person, from_person, word):
    query = """
        MATCH (p:Person { address:'%s' }), (w:Word { value:'%s' })
        MERGE (p)-[r:HEARD {name:'%s'}]->(w)
        RETURN r
    """ % (to_person.address, word.value, from_person.address)
    print(query)
    results, meta = db.cypher_query(query)
    print(results)
    heards = [Heard.inflate(row[0]) for row in results]
    print('heards: {}'.format(heards))
    return heards[0]
def all_roles(self):
    results = db.cypher_query(
        """
        MATCH (r:Role), (t:Topic)
        WHERE ID(t) = {id} AND (t)-[:RELATED_TO*1..]->(r)
        RETURN r
        """,
        dict(id=self._id)
    )
    return [
        Role.inflate(result['r'])
        for result in results[0]
    ]
def all_projects(self):
    results = db.cypher_query(
        """
        MATCH (p:Project), (t:Topic)
        WHERE ID(t) = {id} AND (t)-[:RELATED_TO*1..]->(p)
        RETURN p
        """,
        dict(id=self._id)
    )
    return [
        Project.inflate(result['p'])
        for result in results[0]
    ]
def queryPubsOfAuthorOverTime_(name):
    query = "MATCH (n1)-[:AUTHORED]->(n2) WHERE n1.name='%s' RETURN n2" % name
    results, meta = db.cypher_query(query)
    countPerYear = {}
    for row in results:
        article = Article.inflate(row[0])
        year = article.year
        if year in countPerYear:
            countPerYear[year] += 1
        else:
            countPerYear[year] = 1
    freq = [{"name": year, "frequency": count}
            for year, count in countPerYear.items()]
    return freq
def resource_collection_response(cls, offset=0, limit=20):
    query = ("MATCH (n) WHERE n:{label} RETURN n "
             "ORDER BY n.id SKIP {offset} LIMIT {limit}").format(
        label=cls.__name__, offset=offset, limit=limit)
    results, meta = db.cypher_query(query)

    data = dict()
    data['data'] = list()
    data['links'] = dict()
    data['links']['self'] = "{class_link}?page[offset]={offset}&page[limit]={limit}".format(
        class_link=cls.get_class_link(), offset=offset, limit=limit)
    data['links']['first'] = "{class_link}?page[offset]={offset}&page[limit]={limit}".format(
        class_link=cls.get_class_link(), offset=0, limit=limit)
    if int(offset) - int(limit) > 0:
        data['links']['prev'] = "{class_link}?page[offset]={offset}&page[limit]={limit}".format(
            class_link=cls.get_class_link(),
            offset=int(offset) - int(limit), limit=limit)
    if len(cls.nodes) > int(offset) + int(limit):
        data['links']['next'] = "{class_link}?page[offset]={offset}&page[limit]={limit}".format(
            class_link=cls.get_class_link(),
            offset=int(offset) + int(limit), limit=limit)
    data['links']['last'] = "{class_link}?page[offset]={offset}&page[limit]={limit}".format(
        class_link=cls.get_class_link(),
        offset=len(cls.nodes) - (len(cls.nodes) % int(limit)), limit=limit)

    list_of_nodes = [cls.inflate(row[0]) for row in results]
    for this_node in list_of_nodes:
        data['data'].append(this_node.get_resource_object())

    r = make_response(jsonify(data))
    r.status_code = http_error_codes.OK
    r.headers['Content-Type'] = CONTENT_TYPE
    return r
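# Illustration of the JSON:API pagination links the function above produces
# (values assume offset=20, limit=20 and 65 matching nodes; the URL prefix is
# whatever get_class_link() returns):
#   self:  ...?page[offset]=20&page[limit]=20
#   first: ...?page[offset]=0&page[limit]=20
#   prev:  omitted, because offset - limit == 0 fails the `> 0` check
#   next:  ...?page[offset]=40&page[limit]=20
#   last:  ...?page[offset]=60&page[limit]=20   (65 - 65 % 20)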
def all_experiences(self):
    # There is a quirk in that a topic could be related to an experience,
    # or an experience could be with a topic. We want to find all such
    # experiences.
    results = db.cypher_query(
        """
        MATCH (e:Experience), (t:Topic)
        WHERE ID(t) = {id}
          AND ((e)-[:WITH]->(t) OR (t)-[:RELATED_TO*1..]->(e))
        RETURN e
        """,
        dict(id=self._id)
    )
    return [
        Experience.inflate(result['e'])
        for result in results[0]
    ]
def test_independent_property_name_get_or_create():
    class TestNode(StructuredNode):
        uid = UniqueIdProperty()
        name_ = StringProperty(db_property="name", required=True)

    # create the node
    TestNode.get_or_create({'uid': 123, 'name_': 'jim'})
    # test that the node is retrieved correctly
    x = TestNode.get_or_create({'uid': 123, 'name_': 'jim'})[0]

    # check database property name on low level
    results, meta = db.cypher_query("MATCH (n:TestNode) RETURN n")
    node_properties = _get_node_properties(results[0][0])
    assert node_properties['name'] == "jim"
    assert 'name_' not in node_properties

    # delete node afterwards
    x.delete()
def test_independent_property_name():
    class TestNode(StructuredNode):
        name_ = StringProperty(db_property="name")

    x = TestNode()
    x.name_ = "jim"
    x.save()

    # check database property name on low level
    results, meta = db.cypher_query("MATCH (n:TestNode) RETURN n")
    assert results[0][0].properties['name'] == "jim"
    assert 'name_' not in results[0][0].properties

    assert not hasattr(x, 'name')
    assert hasattr(x, 'name_')
    assert TestNode.nodes.filter(name_="jim").all()[0].name_ == x.name_
    assert TestNode.nodes.get(name_="jim").name_ == x.name_

    # delete node afterwards
    x.delete()
def queryPublicationsBetweenYears_(startYear, endYear):
    query = ("MATCH (n:Article) WHERE n.year >= %s AND n.year <= %s "
             "RETURN n ORDER BY n.year" % (startYear, endYear))
    results, meta = db.cypher_query(query)
    journals = {}
    for row in results:
        article = Article.inflate(row[0])
        name = article.journal
        year = article.year
        if name not in journals:
            journals[name] = {}
        journalDist = journals[name]
        if year in journalDist:
            journalDist[year] += 1
        else:
            journalDist[year] = 1
    pubYearDist = []
    for name, journalDist in journals.items():
        pubYearDist.append({
            "name": name,
            "articles": [[year, count] for year, count in journalDist.items()],
            "total": sum(journalDist.values())
        })
    return pubYearDist
def all_roles_and_dates(self):
    results = db.cypher_query(
        """
        MATCH (person:Person)-[rel:PERFORMED]->(role:Role)
        WHERE id(person) = {id}
        RETURN role, rel.start_date as start_date, rel.end_date as end_date
        ORDER BY rel.start_date DESC, rel.end_date DESC
        """,
        dict(id=self._id)
    )
    return [
        (
            Role.inflate(result['role']),
            parse8601(result['start_date']) if result['start_date']
            else datetime.date.today(),
            parse8601(result['end_date']) if result['end_date']
            else datetime.date.today(),
        )
        for result in results[0]
    ]
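# parse8601 is called above but not defined in this file; a minimal sketch,
# assuming start_date/end_date are stored as ISO-8601 date strings
# (YYYY-MM-DD). The real helper may also accept full timestamps.
import datetime

def parse8601(value):
    return datetime.datetime.strptime(value, '%Y-%m-%d').date()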
def select_words(person_name):
    # find all words and total frequency
    min_freq = 20
    stdev_weight = 6
    words, query_items = db.cypher_query(
        "match (w:Word)-[h:HEARD]-(p:Person) where p.address = '{}' "
        "OR h.name = '{}' return w.value, "
        "count(h.frequency)".format(person_name, person_name))
    print(words)

    words_to_remove = []
    # remove words with freq too low
    appended_freq_list = []
    ttl_freq_list = []
    for word in list(words):
        word_freq = word[1]
        ttl_freq_list.append(word_freq)
        if word_freq < min_freq:
            words_to_remove.append(word[0])
            logging.info('Removing word, freq < {}: {}'.format(
                min_freq, word[0]))
        else:
            appended_freq_list.append(word_freq)
    print(words)

    # find mean and standard deviation of word frequencies
    stdev = statistics.stdev(ttl_freq_list)
    mean = statistics.mean(appended_freq_list)
    logging.info('stdev:{} and mean:{}'.format(stdev, mean))

    # remove words with freq too high
    for word in list(words):
        word_freq = word[1]
        max_freq = mean + stdev_weight * stdev
        if word_freq > max_freq:
            words_to_remove.append(word[0])
            logging.info('Removing word, freq > {}: {}'.format(
                max_freq, word[0]))

    print('TO REMOVE, {} words: {}'.format(
        len(words_to_remove), words_to_remove))
    print('ALL, {} words: {}'.format(len(words), words))
    word_vals = [w[0] for w in words]

    # Deactivate words in DB
    word_nodes_dict = {n.value: n for n in Word.nodes.all()}
    for word, word_node in word_nodes_dict.items():
        if word not in word_vals:
            continue
        state = word not in words_to_remove
        logging.debug('Word: {}, New: {}, Old: {}'.format(
            word, state, word_node.active))
        if word_node.active != state:
            word_node.active = state
            word_node.save()

    logging.debug('Removed words: {}'.format(words_to_remove))
    logging.debug('Important Words: {}'.format(words))
    return words
def build_training_and_testing_sets(person_name):
    percent_training = 0.7
    select_words(person_name)
    heard_recv, query_items = db.cypher_query(
        "match (w:Word)-[h:HEARD]-(p:Person) where w.active = True and "
        "p.address='{}' return w.value, "
        "h.frequency, h.name".format(person_name)
    )
    heard_sent, query_items = db.cypher_query(
        "match (w:Word)-[h:HEARD]-(p:Person) where w.active = True and "
        "h.name='{}' return w.value, "
        "h.frequency, p.address".format(person_name)
    )
    logging.info('All words, sent only: {}'.format(len(heard_sent)))
    logging.info('All words, received only: {}'.format(len(heard_recv)))

    # Merge operation: after sorting, add the frequencies of adjacent
    # duplicate rows
    heard_words = heard_recv + heard_sent
    heard_words.sort()
    logging.info('All words, combined: {}'.format(len(heard_words)))
    for i in range(1, len(heard_words)):
        if heard_words[i] == heard_words[i - 1]:
            heard_words[i][1] += heard_words[i - 1][1]
            heard_words[i - 1][1] = heard_words[i][1]
            logging.debug('Merge: {}, {}'.format(heard_words[i],
                                                 heard_words[i - 1]))

    # Deduplicate list, frequencies already added
    heard_words = list(set(tuple(word) for word in heard_words))
    logging.info('Unique words: {}'.format(
        len(list(set([w[0] for w in heard_words])))))
    words, freq, people = list(zip(*heard_words))
    distinct_people = list(set(people))
    logging.debug('Distinct people: {}'.format(distinct_people))

    # find test and training matrices
    training_inx = round(len(distinct_people) * percent_training)
    logging.info('Chose {} people for training, {} for testing.'.format(
        training_inx, len(distinct_people) - training_inx))
    logging.debug('Training people: {}'.format(distinct_people[:training_inx]))
    logging.debug('Testing people: {}'.format(distinct_people[training_inx:]))
    training_dict, training_relation = build_training_matrix(
        words, freq, people, distinct_people[:training_inx])
    testing_dict, testing_relation = build_testing_matrix(
        words, freq, people, distinct_people[training_inx:])

    # output training matrix to file
    train_filename = '{}.TRAIN'.format(
        re.search('%s(.*)%s' % ('<', '>'), person_name).group(1))
    with open(train_filename, 'w') as train_file:
        writer = csv.writer(train_file, delimiter=' ', quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["%s " % person
                         for person in distinct_people[:training_inx]])
        writer.writerow(["%s " % entry for entry in training_dict])
        for i in range(len(training_relation)):
            row = [training_dict[entry][i] for entry in training_dict]
            row.insert(0, training_relation[i])
            writer.writerow(row)

    # output testing matrix to file
    test_filename = '{}.TEST'.format(
        re.search('%s(.*)%s' % ('<', '>'), person_name).group(1))
    with open(test_filename, 'w') as test_file:
        writer = csv.writer(test_file, delimiter=' ', quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["%s " % person
                         for person in distinct_people[training_inx:]])
        writer.writerow(["%s " % entry for entry in testing_dict])
        for i in range(len(testing_relation)):
            row = [testing_dict[entry][i] for entry in testing_dict]
            row.insert(0, testing_relation[i])
            writer.writerow(row)
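# build_training_matrix and build_testing_matrix are called above but not
# defined in this file. A minimal sketch of the shape the CSV-writing code
# appears to assume: a dict mapping each word to one frequency column per
# selected person, plus a parallel list of row labels. This is inferred from
# usage, not the original implementation.
def build_training_matrix(words, freqs, people, selected_people):
    matrix = {w: [0] * len(selected_people) for w in set(words)}
    for word, freq, person in zip(words, freqs, people):
        if person in selected_people:
            # record this person's frequency for the word
            matrix[word][selected_people.index(person)] = freq
    return matrix, list(selected_people)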