def filter_tweets_from_file(inputfilepath, outputfilepath=None, art=False,
                            frequency=False, terms_to_remove=None):
    """
    :param inputfilepath: string, required
        Location path for loading the raw tweets
    :param outputfilepath: string, optional, default: None
        Location path for saving the filtered tweets
    :param art: boolean, optional, default: False
        Remove articles, pronouns and prepositions
    :param frequency: boolean, optional, default: False
        Remove infrequently used words
    :param terms_to_remove: list of string, optional, default: None
        List of terms to remove from each tweet
    :return tweets: list of cleaned and filtered tweets
    """
    tweets = filter_tweets(util.read_from_file(inputfilepath), outputfilepath,
                           art, frequency, terms_to_remove)
    return tweets
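A hypothetical call, for illustration only; the file names and removal terms below are placeholders, not values from the original project.

# Hypothetical usage; paths and terms are illustrative placeholders.
cleaned = filter_tweets_from_file("raw_tweets.csv",
                                  outputfilepath="filtered_tweets.csv",
                                  art=True,
                                  terms_to_remove=["rt", "http"])
print(len(cleaned), "tweets after filtering")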
def main():
    if len(sys.argv) != 2:
        print("Wrong Info passed")
        sys.exit(1)

    output = []
    words = sys.argv[1]
    index = read_from_file("index.dat")

    # For each query word, look up the files and occurrence counts in the index
    for word in words.split(' '):
        output.append(get_details(index, word))

    # Ranking = sum the frequencies and sort
    result = rank(output)
    display(result)
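The ranking step is only described in a comment; below is a minimal sketch of what rank could look like, assuming each element of output is a dict mapping filename to occurrence count. That shape is an assumption, and the real get_details may return something different.

from collections import Counter

def rank(output):
    # Sketch only: sum per-word frequencies per file and sort descending.
    # Assumes each entry is a dict of {filename: occurrence_count}.
    totals = Counter()
    for per_word in output:
        for filename, freq in per_word.items():
            totals[filename] += freq
    return totals.most_common()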
def read_config(config_path):
    config = {
        "hostnames": [],
        "configs": {
            "CF_Account_ID": None,
            "CF_Zone_ID": None,
            "CF_Api_Token": None,
            "CF_Api_Key": None
        },
        "query_interval_seconds": 30
    }
    try:
        config_bytes = read_from_file(config_path, 'rb')
        config.update(json.loads(config_bytes))
    except (OSError, ValueError):
        # Missing or malformed config file: keep the defaults above
        pass
    return config
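For illustration, a config file along these lines would override the defaults; the file name and values are placeholders, not from the original project. Note that dict.update() is shallow, so a partial "configs" object in the file replaces the whole nested default block.

# Hypothetical config.json (values are placeholders):
# {
#     "hostnames": ["home.example.com"],
#     "configs": {"CF_Zone_ID": "your-zone-id", "CF_Api_Token": "your-token"},
#     "query_interval_seconds": 60
# }
config = read_config("config.json")
print(config["hostnames"], config["query_interval_seconds"])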
def kmeans(eac, removeTerms, ngram):
    terms = ['lazaro', 'lázaro', 'baez', 'báez', 'carlitos']
    print('Filtering tweets')
    tweets = util.read_from_file("dataset.csv")
    if removeTerms:
        tweets = filter.filter_tweets(tweets, terms_to_remove=terms)
    else:
        tweets = filter.filter_tweets(tweets)

    # Reduce tweets list length
    tweets = tweets[0:6300]

    carlitos = 0
    lazaro = 0
    data = []
    for tw in tweets:
        if tw.tw_type == 'Carlitos':
            carlitos += 1
        else:
            lazaro += 1
        data.append(tw.text)
    print(carlitos, lazaro)

    print("Transform Data...")
    # Transform data
    if ngram:
        hasher = HashingVectorizer(non_negative=True, ngram_range=(1, 3),
                                   analyzer='word', norm='l2', binary=False)
    else:
        hasher = HashingVectorizer(non_negative=True, norm='l2', binary=False)
    vectorizer = make_pipeline(hasher)
    X = vectorizer.fit_transform(data)

    count = 0
    precision_list = []
    while count < 100:
        # Start timer
        t0 = time()
        if eac:
            clustering = EAC(30, min_k=2, max_k=10)
            EAC_D = clustering.fit(X).distance_
            # K-medoids over EAC_D
            kmed = KMedoids(2, init='random', distance_metric="precomputed")
            labels = kmed.fit(EAC_D).labels_
        else:
            km = KMeans(n_clusters=2, init='k-means++', n_init=1, max_iter=100)
            labels = km.fit(X).labels_

        # Assign labels to tweets
        for i in range(len(tweets)):
            tweets[i].label = labels[i]

        # Print precision
        print("Precision: ")
        precision = util.precision(tweets)
        print("done in %0.3fs" % (time() - t0))
        if isAdable(precision):
            precision_list.append(precision)
            count += 1
    return precision_list
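A version note, not from the original project: non_negative was deprecated in scikit-learn 0.19 and removed in 0.21, so the hasher above does not construct on current releases. A roughly equivalent setup for newer versions is sketched below.

# Sketch for newer scikit-learn releases; alternate_sign=False keeps all
# hashed feature values non-negative, which is close to (but not identical
# to) the old non_negative=True behaviour.
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import make_pipeline

hasher = HashingVectorizer(alternate_sign=False, ngram_range=(1, 3),
                           analyzer='word', norm='l2', binary=False)
vectorizer = make_pipeline(hasher)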
def minhash(eac, shingle, removeTerms):
    terms = ['lazaro', 'lázaro', 'baez', 'báez', 'carlitos']
    print('Filtering tweets')
    tweets = util.read_from_file("dataset.csv")
    if removeTerms:
        tweets = filter.filter_tweets(tweets, terms_to_remove=terms)
    else:
        tweets = filter.filter_tweets(tweets)

    # Reduce tweets list length
    tweets = tweets[0:6300]

    carlitos = 0
    lazaro = 0
    data = []
    for tw in tweets:
        if tw.tw_type == 'Carlitos':
            carlitos += 1
        else:
            lazaro += 1
        data.append(tw.text)
    print(carlitos, lazaro)

    # Extract text from tweets
    X = [tw.text for tw in tweets]

    # Start timer
    t0 = time()
    print("Calculating distance matrix...")
    D = metrics.jaccard_minhash_distance_mp(X, shingle_length=shingle)

    count = 0
    precision_list = []
    while count < 100:
        if eac:
            print("EAC clustering...")
            # EAC clustering
            kmedoid = KMedoids(init='random', distance_metric='precomputed')
            clustering = EAC(30, min_k=2, max_k=10, clustering=kmedoid)
            EAC_D = clustering.fit(D).distance_
            # K-medoids over EAC_D
            kmed = KMedoids(2, init='random', distance_metric="precomputed")
            labels = kmed.fit(EAC_D).labels_
        else:
            kmedoid = KMedoids(2, init='random', distance_metric='precomputed')
            print("Kmedoids clustering...")
            labels = kmedoid.fit(D).labels_

        # Assign labels to tweets
        for i in range(len(tweets)):
            tweets[i].label = labels[i]

        # Print precision
        print("Precision: ")
        precision = util.precision(tweets)
        if isAdable(precision):
            print(count)
            precision_list.append(precision)
            count += 1

    print("done in %0.3fs" % (time() - t0))
    return precision_list
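Both clustering routines call isAdable(), which is not defined in these snippets. A minimal sketch, assuming it only rejects invalid precision values; the project's real check may differ.

import math

def isAdable(precision):
    # Sketch only: accept a precision value if it is a real number in [0, 1].
    return (precision is not None
            and not math.isnan(precision)
            and 0.0 <= precision <= 1.0)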
node = nodes[node_name]
ip_address = '10.10.20.%i/24' % i
net_intf = SshJob(
    node=node,
    command=[
        Run('mkdir', '/root/captures', '/root/errors'),
        Run('ifconfig', 'data', ip_address, 'up')
    ],
    required=load_images,
    scheduler=scheduler)
net_intfs.append(net_intf)

# -------------
# Installing Distrinet in the client/master node
install_script = read_from_file('install_script.sh')
install = SshJob(
    node=nodes[master],
    command=RunString(install_script),
    required=tuple(net_intfs),
    scheduler=scheduler)

# -------------
# Loading image tarballs
load_tarballs = SshJob(
    node=faraday,
    command=Run('scp', '-o StrictHostKeyChecking=no',
                '~/VoD/*.tar.gz', 'root@fit01:'),
    required=install,
    scheduler=scheduler)
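The jobs above are only queued on a shared scheduler; assuming this is the usual asynciojobs/nepi-ng Scheduler that SshJob is designed for, a run step along these lines would typically follow. This is a sketch, not part of the original snippet.

# Sketch: run the dependency graph built above and report any failures.
ok = scheduler.orchestrate()
if not ok:
    scheduler.debrief()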
import sys
import csv
from util import (read_from_file, get_all_rooms, get_course_attr,
                  get_readable_attrs, print_columns, time_set)

times, building, day = sys.argv[1:]
day = day.upper()
search_times = time_set(times)

courses = read_from_file(building)
rooms = get_all_rooms(courses)
whole_day = set(range(0, 1440))

results = []
for room in rooms:
    courses_in_room = [c for c in courses
                       if get_course_attr(c, 'room') == room
                       and day in get_course_attr(c, 'days')]
    busy_times = set()
    for course in courses_in_room:
        busy_times |= time_set(get_course_attr(course, 'rawtime'))
    available_times = whole_day - busy_times
    if search_times <= available_times:
        results.append(room)

print('Found {0} results'.format(len(results)))
# Sort, putting shorter room numbers first
results = sorted(results, key=lambda x: x.rjust(99))
for item in results:
    print(item)